From 483eb2f56657e8e7f419ab1a4fab8dce9ade8609 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sat, 27 Apr 2024 20:24:20 +0200
Subject: Adding upstream version 14.2.21.

Signed-off-by: Daniel Baumann
---
 src/spdk/lib/Makefile | 58 +
 src/spdk/lib/bdev/Makefile | 60 +
 src/spdk/lib/bdev/aio/Makefile | 41 +
 src/spdk/lib/bdev/aio/bdev_aio.c | 751 +++
 src/spdk/lib/bdev/aio/bdev_aio.h | 80 +
 src/spdk/lib/bdev/aio/bdev_aio_rpc.c | 160 +
 src/spdk/lib/bdev/bdev.c | 3950 ++++++++++++++
 src/spdk/lib/bdev/crypto/Makefile | 42 +
 src/spdk/lib/bdev/crypto/vbdev_crypto.c | 1506 ++++++
 src/spdk/lib/bdev/crypto/vbdev_crypto.h | 66 +
 src/spdk/lib/bdev/crypto/vbdev_crypto_rpc.c | 163 +
 src/spdk/lib/bdev/error/Makefile | 40 +
 src/spdk/lib/bdev/error/vbdev_error.c | 513 ++
 src/spdk/lib/bdev/error/vbdev_error.h | 76 +
 src/spdk/lib/bdev/error/vbdev_error_rpc.c | 258 +
 src/spdk/lib/bdev/gpt/Makefile | 40 +
 src/spdk/lib/bdev/gpt/gpt.c | 239 +
 src/spdk/lib/bdev/gpt/gpt.h | 62 +
 src/spdk/lib/bdev/gpt/vbdev_gpt.c | 463 ++
 src/spdk/lib/bdev/iscsi/Makefile | 46 +
 src/spdk/lib/bdev/iscsi/bdev_iscsi.c | 875 +++
 src/spdk/lib/bdev/iscsi/bdev_iscsi.h | 75 +
 src/spdk/lib/bdev/iscsi/bdev_iscsi_rpc.c | 173 +
 src/spdk/lib/bdev/lvol/Makefile | 41 +
 src/spdk/lib/bdev/lvol/vbdev_lvol.c | 1321 +++++
 src/spdk/lib/bdev/lvol/vbdev_lvol.h | 120 +
 src/spdk/lib/bdev/lvol/vbdev_lvol_rpc.c | 1089 ++++
 src/spdk/lib/bdev/malloc/Makefile | 41 +
 src/spdk/lib/bdev/malloc/bdev_malloc.c | 524 ++
 src/spdk/lib/bdev/malloc/bdev_malloc.h | 48 +
 src/spdk/lib/bdev/malloc/bdev_malloc_rpc.c | 170 +
 src/spdk/lib/bdev/null/Makefile | 40 +
 src/spdk/lib/bdev/null/bdev_null.c | 384 ++
 src/spdk/lib/bdev/null/bdev_null.h | 57 +
 src/spdk/lib/bdev/null/bdev_null_rpc.c | 169 +
 src/spdk/lib/bdev/nvme/Makefile | 40 +
 src/spdk/lib/bdev/nvme/bdev_nvme.c | 1856 +++++++
 src/spdk/lib/bdev/nvme/bdev_nvme.h | 112 +
 src/spdk/lib/bdev/nvme/bdev_nvme_rpc.c | 740 +++
 src/spdk/lib/bdev/nvme/nvme_rpc.c | 487 ++
 src/spdk/lib/bdev/part.c | 373 ++
 src/spdk/lib/bdev/passthru/Makefile | 42 +
 src/spdk/lib/bdev/passthru/vbdev_passthru.c | 671 +++
 src/spdk/lib/bdev/passthru/vbdev_passthru.h | 62 +
 src/spdk/lib/bdev/passthru/vbdev_passthru_rpc.c | 160 +
 src/spdk/lib/bdev/pmem/Makefile | 40 +
 src/spdk/lib/bdev/pmem/bdev_pmem.c | 465 ++
 src/spdk/lib/bdev/pmem/bdev_pmem.h | 64 +
 src/spdk/lib/bdev/pmem/bdev_pmem_rpc.c | 350 ++
 src/spdk/lib/bdev/raid/Makefile | 41 +
 src/spdk/lib/bdev/raid/bdev_raid.c | 1624 ++++++
 src/spdk/lib/bdev/raid/bdev_raid.h | 225 +
 src/spdk/lib/bdev/raid/bdev_raid_rpc.c | 408 ++
 src/spdk/lib/bdev/rbd/Makefile | 40 +
 src/spdk/lib/bdev/rbd/bdev_rbd.c | 740 +++
 src/spdk/lib/bdev/rbd/bdev_rbd.h | 55 +
 src/spdk/lib/bdev/rbd/bdev_rbd_rpc.c | 157 +
 src/spdk/lib/bdev/rpc/Makefile | 40 +
 src/spdk/lib/bdev/rpc/bdev_rpc.c | 587 ++
 src/spdk/lib/bdev/scsi_nvme.c | 261 +
 src/spdk/lib/bdev/split/Makefile | 40 +
 src/spdk/lib/bdev/split/vbdev_split.c | 565 ++
 src/spdk/lib/bdev/split/vbdev_split.h | 68 +
 src/spdk/lib/bdev/split/vbdev_split_rpc.c | 151 +
 src/spdk/lib/bdev/virtio/Makefile | 40 +
 src/spdk/lib/bdev/virtio/bdev_virtio.h | 164 +
 src/spdk/lib/bdev/virtio/bdev_virtio_blk.c | 707 +++
 src/spdk/lib/bdev/virtio/bdev_virtio_rpc.c | 613 +++
 src/spdk/lib/bdev/virtio/bdev_virtio_scsi.c | 2017 +++++++
 src/spdk/lib/bdev/vtune.c | 49 +
 src/spdk/lib/blob/Makefile | 42 +
 src/spdk/lib/blob/bdev/Makefile | 40 +
 src/spdk/lib/blob/bdev/blob_bdev.c | 357 ++
 src/spdk/lib/blob/blob_bs_dev.c | 150 +
 src/spdk/lib/blob/blobstore.c | 5720 ++++++++++++++++++++
 src/spdk/lib/blob/blobstore.h | 572 ++
src/spdk/lib/blob/request.c | 558 ++ src/spdk/lib/blob/request.h | 223 + src/spdk/lib/blob/zeroes.c | 122 + src/spdk/lib/blobfs/Makefile | 40 + src/spdk/lib/blobfs/blobfs.c | 2617 +++++++++ src/spdk/lib/blobfs/blobfs_internal.h | 69 + src/spdk/lib/blobfs/tree.c | 181 + src/spdk/lib/blobfs/tree.h | 77 + src/spdk/lib/conf/Makefile | 40 + src/spdk/lib/conf/conf.c | 684 +++ src/spdk/lib/copy/Makefile | 42 + src/spdk/lib/copy/copy_engine.c | 318 ++ src/spdk/lib/copy/ioat/Makefile | 40 + src/spdk/lib/copy/ioat/copy_engine_ioat.c | 421 ++ src/spdk/lib/copy/ioat/copy_engine_ioat.h | 44 + src/spdk/lib/copy/ioat/copy_engine_ioat_rpc.c | 118 + src/spdk/lib/env_dpdk/Makefile | 42 + src/spdk/lib/env_dpdk/env.c | 419 ++ src/spdk/lib/env_dpdk/env.mk | 112 + src/spdk/lib/env_dpdk/env_internal.h | 104 + src/spdk/lib/env_dpdk/init.c | 401 ++ src/spdk/lib/env_dpdk/memory.c | 712 +++ src/spdk/lib/env_dpdk/pci.c | 551 ++ src/spdk/lib/env_dpdk/pci_ioat.c | 123 + src/spdk/lib/env_dpdk/pci_nvme.c | 89 + src/spdk/lib/env_dpdk/pci_virtio.c | 80 + src/spdk/lib/env_dpdk/threads.c | 108 + src/spdk/lib/env_dpdk/vtophys.c | 691 +++ src/spdk/lib/event/Makefile | 42 + src/spdk/lib/event/app.c | 998 ++++ src/spdk/lib/event/reactor.c | 804 +++ src/spdk/lib/event/rpc.c | 82 + src/spdk/lib/event/rpc/Makefile | 40 + src/spdk/lib/event/rpc/app_rpc.c | 155 + src/spdk/lib/event/rpc/subsystem_rpc.c | 129 + src/spdk/lib/event/subsystem.c | 256 + src/spdk/lib/event/subsystems/Makefile | 44 + src/spdk/lib/event/subsystems/bdev/Makefile | 40 + src/spdk/lib/event/subsystems/bdev/bdev.c | 83 + src/spdk/lib/event/subsystems/bdev/bdev_rpc.c | 97 + src/spdk/lib/event/subsystems/copy/Makefile | 40 + src/spdk/lib/event/subsystems/copy/copy.c | 70 + src/spdk/lib/event/subsystems/iscsi/Makefile | 41 + src/spdk/lib/event/subsystems/iscsi/iscsi.c | 81 + src/spdk/lib/event/subsystems/iscsi/iscsi_rpc.c | 119 + src/spdk/lib/event/subsystems/nbd/Makefile | 40 + src/spdk/lib/event/subsystems/nbd/nbd.c | 74 + src/spdk/lib/event/subsystems/net/Makefile | 40 + src/spdk/lib/event/subsystems/net/net.c | 91 + src/spdk/lib/event/subsystems/nvmf/Makefile | 40 + src/spdk/lib/event/subsystems/nvmf/conf.c | 587 ++ src/spdk/lib/event/subsystems/nvmf/event_nvmf.h | 67 + src/spdk/lib/event/subsystems/nvmf/nvmf_rpc.c | 1562 ++++++ .../event/subsystems/nvmf/nvmf_rpc_deprecated.c | 620 +++ src/spdk/lib/event/subsystems/nvmf/nvmf_tgt.c | 503 ++ src/spdk/lib/event/subsystems/scsi/Makefile | 40 + src/spdk/lib/event/subsystems/scsi/scsi.c | 65 + src/spdk/lib/event/subsystems/vhost/Makefile | 40 + src/spdk/lib/event/subsystems/vhost/vhost.c | 71 + src/spdk/lib/ioat/Makefile | 40 + src/spdk/lib/ioat/ioat.c | 733 +++ src/spdk/lib/ioat/ioat_internal.h | 100 + src/spdk/lib/iscsi/Makefile | 45 + src/spdk/lib/iscsi/acceptor.c | 91 + src/spdk/lib/iscsi/acceptor.h | 43 + src/spdk/lib/iscsi/conn.c | 1470 +++++ src/spdk/lib/iscsi/conn.h | 193 + src/spdk/lib/iscsi/init_grp.c | 786 +++ src/spdk/lib/iscsi/init_grp.h | 79 + src/spdk/lib/iscsi/iscsi.c | 4583 ++++++++++++++++ src/spdk/lib/iscsi/iscsi.h | 467 ++ src/spdk/lib/iscsi/iscsi_rpc.c | 1542 ++++++ src/spdk/lib/iscsi/iscsi_subsystem.c | 1523 ++++++ src/spdk/lib/iscsi/md5.c | 75 + src/spdk/lib/iscsi/md5.h | 52 + src/spdk/lib/iscsi/param.c | 1182 ++++ src/spdk/lib/iscsi/param.h | 84 + src/spdk/lib/iscsi/portal_grp.c | 707 +++ src/spdk/lib/iscsi/portal_grp.h | 83 + src/spdk/lib/iscsi/task.c | 88 + src/spdk/lib/iscsi/task.h | 187 + src/spdk/lib/iscsi/tgt_node.c | 1538 ++++++ src/spdk/lib/iscsi/tgt_node.h | 146 + 
src/spdk/lib/json/Makefile | 40 + src/spdk/lib/json/json_parse.c | 668 +++ src/spdk/lib/json/json_util.c | 650 +++ src/spdk/lib/json/json_write.c | 687 +++ src/spdk/lib/jsonrpc/Makefile | 41 + src/spdk/lib/jsonrpc/jsonrpc_client.c | 213 + src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c | 284 + src/spdk/lib/jsonrpc/jsonrpc_internal.h | 149 + src/spdk/lib/jsonrpc/jsonrpc_server.c | 360 ++ src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c | 394 ++ src/spdk/lib/log/Makefile | 45 + src/spdk/lib/log/log.c | 189 + src/spdk/lib/log/log_flags.c | 196 + src/spdk/lib/log/rpc/Makefile | 40 + src/spdk/lib/log/rpc/log_rpc.c | 336 ++ src/spdk/lib/lvol/Makefile | 40 + src/spdk/lib/lvol/lvol.c | 1494 +++++ src/spdk/lib/nbd/Makefile | 40 + src/spdk/lib/nbd/nbd.c | 969 ++++ src/spdk/lib/nbd/nbd_internal.h | 52 + src/spdk/lib/nbd/nbd_rpc.c | 304 ++ src/spdk/lib/net/Makefile | 41 + src/spdk/lib/net/interface.c | 505 ++ src/spdk/lib/net/net_internal.h | 79 + src/spdk/lib/net/net_rpc.c | 180 + src/spdk/lib/nvme/Makefile | 61 + src/spdk/lib/nvme/nvme.c | 862 +++ src/spdk/lib/nvme/nvme_ctrlr.c | 2678 +++++++++ src/spdk/lib/nvme/nvme_ctrlr_cmd.c | 694 +++ src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c | 83 + src/spdk/lib/nvme/nvme_fabric.c | 340 ++ src/spdk/lib/nvme/nvme_internal.h | 1003 ++++ src/spdk/lib/nvme/nvme_ns.c | 360 ++ src/spdk/lib/nvme/nvme_ns_cmd.c | 1026 ++++ src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c | 232 + src/spdk/lib/nvme/nvme_pcie.c | 2142 ++++++++ src/spdk/lib/nvme/nvme_qpair.c | 663 +++ src/spdk/lib/nvme/nvme_quirks.c | 141 + src/spdk/lib/nvme/nvme_rdma.c | 1634 ++++++ src/spdk/lib/nvme/nvme_transport.c | 219 + src/spdk/lib/nvme/nvme_uevent.c | 214 + src/spdk/lib/nvme/nvme_uevent.h | 61 + src/spdk/lib/nvmf/Makefile | 63 + src/spdk/lib/nvmf/ctrlr.c | 1773 ++++++ src/spdk/lib/nvmf/ctrlr_bdev.c | 531 ++ src/spdk/lib/nvmf/ctrlr_discovery.c | 144 + src/spdk/lib/nvmf/nvmf.c | 1173 ++++ src/spdk/lib/nvmf/nvmf_fc.h | 871 +++ src/spdk/lib/nvmf/nvmf_internal.h | 333 ++ src/spdk/lib/nvmf/rdma.c | 2930 ++++++++++ src/spdk/lib/nvmf/request.c | 190 + src/spdk/lib/nvmf/subsystem.c | 1269 +++++ src/spdk/lib/nvmf/transport.c | 236 + src/spdk/lib/nvmf/transport.h | 200 + src/spdk/lib/rocksdb/env_spdk.cc | 764 +++ src/spdk/lib/rocksdb/spdk.rocksdb.mk | 70 + src/spdk/lib/rpc/Makefile | 40 + src/spdk/lib/rpc/rpc.c | 285 + src/spdk/lib/scsi/Makefile | 40 + src/spdk/lib/scsi/dev.c | 415 ++ src/spdk/lib/scsi/lun.c | 452 ++ src/spdk/lib/scsi/port.c | 96 + src/spdk/lib/scsi/scsi.c | 69 + src/spdk/lib/scsi/scsi_bdev.c | 2116 ++++++++ src/spdk/lib/scsi/scsi_internal.h | 160 + src/spdk/lib/scsi/scsi_rpc.c | 82 + src/spdk/lib/scsi/task.c | 256 + src/spdk/lib/sock/Makefile | 44 + src/spdk/lib/sock/net_framework.c | 70 + src/spdk/lib/sock/posix/Makefile | 40 + src/spdk/lib/sock/posix/posix.c | 604 +++ src/spdk/lib/sock/sock.c | 373 ++ src/spdk/lib/sock/vpp/Makefile | 41 + src/spdk/lib/sock/vpp/vpp.c | 663 +++ src/spdk/lib/thread/Makefile | 40 + src/spdk/lib/thread/thread.c | 768 +++ src/spdk/lib/trace/Makefile | 40 + src/spdk/lib/trace/trace.c | 168 + src/spdk/lib/trace/trace_flags.c | 179 + src/spdk/lib/ut_mock/Makefile | 40 + src/spdk/lib/ut_mock/mock.c | 45 + src/spdk/lib/util/Makefile | 41 + src/spdk/lib/util/base64.c | 228 + src/spdk/lib/util/bit_array.c | 313 ++ src/spdk/lib/util/cpuset.c | 320 ++ src/spdk/lib/util/crc16.c | 53 + src/spdk/lib/util/crc32.c | 66 + src/spdk/lib/util/crc32_ieee.c | 48 + src/spdk/lib/util/crc32c.c | 89 + src/spdk/lib/util/fd.c | 103 + src/spdk/lib/util/strerror_tls.c | 43 + src/spdk/lib/util/string.c | 405 ++ 
src/spdk/lib/util/uuid.c | 67 + src/spdk/lib/vhost/Makefile | 47 + src/spdk/lib/vhost/rte_vhost/Makefile | 46 + src/spdk/lib/vhost/rte_vhost/fd_man.c | 300 + src/spdk/lib/vhost/rte_vhost/fd_man.h | 69 + src/spdk/lib/vhost/rte_vhost/rte_vhost.h | 474 ++ src/spdk/lib/vhost/rte_vhost/socket.c | 819 +++ src/spdk/lib/vhost/rte_vhost/vhost.c | 482 ++ src/spdk/lib/vhost/rte_vhost/vhost.h | 321 ++ src/spdk/lib/vhost/rte_vhost/vhost_user.c | 1360 +++++ src/spdk/lib/vhost/rte_vhost/vhost_user.h | 182 + src/spdk/lib/vhost/vhost.c | 1503 +++++ src/spdk/lib/vhost/vhost_blk.c | 901 +++ src/spdk/lib/vhost/vhost_internal.h | 277 + src/spdk/lib/vhost/vhost_nvme.c | 1465 +++++ src/spdk/lib/vhost/vhost_rpc.c | 814 +++ src/spdk/lib/vhost/vhost_scsi.c | 1271 +++++ src/spdk/lib/virtio/Makefile | 42 + src/spdk/lib/virtio/virtio.c | 738 +++ src/spdk/lib/virtio/virtio_pci.c | 590 ++ src/spdk/lib/virtio/virtio_user.c | 621 +++ src/spdk/lib/virtio/virtio_user/vhost.h | 113 + src/spdk/lib/virtio/virtio_user/vhost_user.c | 518 ++ 274 files changed, 119311 insertions(+) create mode 100644 src/spdk/lib/Makefile create mode 100644 src/spdk/lib/bdev/Makefile create mode 100644 src/spdk/lib/bdev/aio/Makefile create mode 100644 src/spdk/lib/bdev/aio/bdev_aio.c create mode 100644 src/spdk/lib/bdev/aio/bdev_aio.h create mode 100644 src/spdk/lib/bdev/aio/bdev_aio_rpc.c create mode 100644 src/spdk/lib/bdev/bdev.c create mode 100644 src/spdk/lib/bdev/crypto/Makefile create mode 100644 src/spdk/lib/bdev/crypto/vbdev_crypto.c create mode 100644 src/spdk/lib/bdev/crypto/vbdev_crypto.h create mode 100644 src/spdk/lib/bdev/crypto/vbdev_crypto_rpc.c create mode 100644 src/spdk/lib/bdev/error/Makefile create mode 100644 src/spdk/lib/bdev/error/vbdev_error.c create mode 100644 src/spdk/lib/bdev/error/vbdev_error.h create mode 100644 src/spdk/lib/bdev/error/vbdev_error_rpc.c create mode 100644 src/spdk/lib/bdev/gpt/Makefile create mode 100644 src/spdk/lib/bdev/gpt/gpt.c create mode 100644 src/spdk/lib/bdev/gpt/gpt.h create mode 100644 src/spdk/lib/bdev/gpt/vbdev_gpt.c create mode 100644 src/spdk/lib/bdev/iscsi/Makefile create mode 100644 src/spdk/lib/bdev/iscsi/bdev_iscsi.c create mode 100644 src/spdk/lib/bdev/iscsi/bdev_iscsi.h create mode 100644 src/spdk/lib/bdev/iscsi/bdev_iscsi_rpc.c create mode 100644 src/spdk/lib/bdev/lvol/Makefile create mode 100644 src/spdk/lib/bdev/lvol/vbdev_lvol.c create mode 100644 src/spdk/lib/bdev/lvol/vbdev_lvol.h create mode 100644 src/spdk/lib/bdev/lvol/vbdev_lvol_rpc.c create mode 100644 src/spdk/lib/bdev/malloc/Makefile create mode 100644 src/spdk/lib/bdev/malloc/bdev_malloc.c create mode 100644 src/spdk/lib/bdev/malloc/bdev_malloc.h create mode 100644 src/spdk/lib/bdev/malloc/bdev_malloc_rpc.c create mode 100644 src/spdk/lib/bdev/null/Makefile create mode 100644 src/spdk/lib/bdev/null/bdev_null.c create mode 100644 src/spdk/lib/bdev/null/bdev_null.h create mode 100644 src/spdk/lib/bdev/null/bdev_null_rpc.c create mode 100644 src/spdk/lib/bdev/nvme/Makefile create mode 100644 src/spdk/lib/bdev/nvme/bdev_nvme.c create mode 100644 src/spdk/lib/bdev/nvme/bdev_nvme.h create mode 100644 src/spdk/lib/bdev/nvme/bdev_nvme_rpc.c create mode 100644 src/spdk/lib/bdev/nvme/nvme_rpc.c create mode 100644 src/spdk/lib/bdev/part.c create mode 100644 src/spdk/lib/bdev/passthru/Makefile create mode 100644 src/spdk/lib/bdev/passthru/vbdev_passthru.c create mode 100644 src/spdk/lib/bdev/passthru/vbdev_passthru.h create mode 100644 src/spdk/lib/bdev/passthru/vbdev_passthru_rpc.c create mode 100644 
src/spdk/lib/bdev/pmem/Makefile create mode 100644 src/spdk/lib/bdev/pmem/bdev_pmem.c create mode 100644 src/spdk/lib/bdev/pmem/bdev_pmem.h create mode 100644 src/spdk/lib/bdev/pmem/bdev_pmem_rpc.c create mode 100644 src/spdk/lib/bdev/raid/Makefile create mode 100644 src/spdk/lib/bdev/raid/bdev_raid.c create mode 100644 src/spdk/lib/bdev/raid/bdev_raid.h create mode 100644 src/spdk/lib/bdev/raid/bdev_raid_rpc.c create mode 100644 src/spdk/lib/bdev/rbd/Makefile create mode 100644 src/spdk/lib/bdev/rbd/bdev_rbd.c create mode 100644 src/spdk/lib/bdev/rbd/bdev_rbd.h create mode 100644 src/spdk/lib/bdev/rbd/bdev_rbd_rpc.c create mode 100644 src/spdk/lib/bdev/rpc/Makefile create mode 100644 src/spdk/lib/bdev/rpc/bdev_rpc.c create mode 100644 src/spdk/lib/bdev/scsi_nvme.c create mode 100644 src/spdk/lib/bdev/split/Makefile create mode 100644 src/spdk/lib/bdev/split/vbdev_split.c create mode 100644 src/spdk/lib/bdev/split/vbdev_split.h create mode 100644 src/spdk/lib/bdev/split/vbdev_split_rpc.c create mode 100644 src/spdk/lib/bdev/virtio/Makefile create mode 100644 src/spdk/lib/bdev/virtio/bdev_virtio.h create mode 100644 src/spdk/lib/bdev/virtio/bdev_virtio_blk.c create mode 100644 src/spdk/lib/bdev/virtio/bdev_virtio_rpc.c create mode 100644 src/spdk/lib/bdev/virtio/bdev_virtio_scsi.c create mode 100644 src/spdk/lib/bdev/vtune.c create mode 100644 src/spdk/lib/blob/Makefile create mode 100644 src/spdk/lib/blob/bdev/Makefile create mode 100644 src/spdk/lib/blob/bdev/blob_bdev.c create mode 100644 src/spdk/lib/blob/blob_bs_dev.c create mode 100644 src/spdk/lib/blob/blobstore.c create mode 100644 src/spdk/lib/blob/blobstore.h create mode 100644 src/spdk/lib/blob/request.c create mode 100644 src/spdk/lib/blob/request.h create mode 100644 src/spdk/lib/blob/zeroes.c create mode 100644 src/spdk/lib/blobfs/Makefile create mode 100644 src/spdk/lib/blobfs/blobfs.c create mode 100644 src/spdk/lib/blobfs/blobfs_internal.h create mode 100644 src/spdk/lib/blobfs/tree.c create mode 100644 src/spdk/lib/blobfs/tree.h create mode 100644 src/spdk/lib/conf/Makefile create mode 100644 src/spdk/lib/conf/conf.c create mode 100644 src/spdk/lib/copy/Makefile create mode 100644 src/spdk/lib/copy/copy_engine.c create mode 100644 src/spdk/lib/copy/ioat/Makefile create mode 100644 src/spdk/lib/copy/ioat/copy_engine_ioat.c create mode 100644 src/spdk/lib/copy/ioat/copy_engine_ioat.h create mode 100644 src/spdk/lib/copy/ioat/copy_engine_ioat_rpc.c create mode 100644 src/spdk/lib/env_dpdk/Makefile create mode 100644 src/spdk/lib/env_dpdk/env.c create mode 100644 src/spdk/lib/env_dpdk/env.mk create mode 100644 src/spdk/lib/env_dpdk/env_internal.h create mode 100644 src/spdk/lib/env_dpdk/init.c create mode 100644 src/spdk/lib/env_dpdk/memory.c create mode 100644 src/spdk/lib/env_dpdk/pci.c create mode 100644 src/spdk/lib/env_dpdk/pci_ioat.c create mode 100644 src/spdk/lib/env_dpdk/pci_nvme.c create mode 100644 src/spdk/lib/env_dpdk/pci_virtio.c create mode 100644 src/spdk/lib/env_dpdk/threads.c create mode 100644 src/spdk/lib/env_dpdk/vtophys.c create mode 100644 src/spdk/lib/event/Makefile create mode 100644 src/spdk/lib/event/app.c create mode 100644 src/spdk/lib/event/reactor.c create mode 100644 src/spdk/lib/event/rpc.c create mode 100644 src/spdk/lib/event/rpc/Makefile create mode 100644 src/spdk/lib/event/rpc/app_rpc.c create mode 100644 src/spdk/lib/event/rpc/subsystem_rpc.c create mode 100644 src/spdk/lib/event/subsystem.c create mode 100644 src/spdk/lib/event/subsystems/Makefile create mode 100644 
src/spdk/lib/event/subsystems/bdev/Makefile create mode 100644 src/spdk/lib/event/subsystems/bdev/bdev.c create mode 100644 src/spdk/lib/event/subsystems/bdev/bdev_rpc.c create mode 100644 src/spdk/lib/event/subsystems/copy/Makefile create mode 100644 src/spdk/lib/event/subsystems/copy/copy.c create mode 100644 src/spdk/lib/event/subsystems/iscsi/Makefile create mode 100644 src/spdk/lib/event/subsystems/iscsi/iscsi.c create mode 100644 src/spdk/lib/event/subsystems/iscsi/iscsi_rpc.c create mode 100644 src/spdk/lib/event/subsystems/nbd/Makefile create mode 100644 src/spdk/lib/event/subsystems/nbd/nbd.c create mode 100644 src/spdk/lib/event/subsystems/net/Makefile create mode 100644 src/spdk/lib/event/subsystems/net/net.c create mode 100644 src/spdk/lib/event/subsystems/nvmf/Makefile create mode 100644 src/spdk/lib/event/subsystems/nvmf/conf.c create mode 100644 src/spdk/lib/event/subsystems/nvmf/event_nvmf.h create mode 100644 src/spdk/lib/event/subsystems/nvmf/nvmf_rpc.c create mode 100644 src/spdk/lib/event/subsystems/nvmf/nvmf_rpc_deprecated.c create mode 100644 src/spdk/lib/event/subsystems/nvmf/nvmf_tgt.c create mode 100644 src/spdk/lib/event/subsystems/scsi/Makefile create mode 100644 src/spdk/lib/event/subsystems/scsi/scsi.c create mode 100644 src/spdk/lib/event/subsystems/vhost/Makefile create mode 100644 src/spdk/lib/event/subsystems/vhost/vhost.c create mode 100644 src/spdk/lib/ioat/Makefile create mode 100644 src/spdk/lib/ioat/ioat.c create mode 100644 src/spdk/lib/ioat/ioat_internal.h create mode 100644 src/spdk/lib/iscsi/Makefile create mode 100644 src/spdk/lib/iscsi/acceptor.c create mode 100644 src/spdk/lib/iscsi/acceptor.h create mode 100644 src/spdk/lib/iscsi/conn.c create mode 100644 src/spdk/lib/iscsi/conn.h create mode 100644 src/spdk/lib/iscsi/init_grp.c create mode 100644 src/spdk/lib/iscsi/init_grp.h create mode 100644 src/spdk/lib/iscsi/iscsi.c create mode 100644 src/spdk/lib/iscsi/iscsi.h create mode 100644 src/spdk/lib/iscsi/iscsi_rpc.c create mode 100644 src/spdk/lib/iscsi/iscsi_subsystem.c create mode 100644 src/spdk/lib/iscsi/md5.c create mode 100644 src/spdk/lib/iscsi/md5.h create mode 100644 src/spdk/lib/iscsi/param.c create mode 100644 src/spdk/lib/iscsi/param.h create mode 100644 src/spdk/lib/iscsi/portal_grp.c create mode 100644 src/spdk/lib/iscsi/portal_grp.h create mode 100644 src/spdk/lib/iscsi/task.c create mode 100644 src/spdk/lib/iscsi/task.h create mode 100644 src/spdk/lib/iscsi/tgt_node.c create mode 100644 src/spdk/lib/iscsi/tgt_node.h create mode 100644 src/spdk/lib/json/Makefile create mode 100644 src/spdk/lib/json/json_parse.c create mode 100644 src/spdk/lib/json/json_util.c create mode 100644 src/spdk/lib/json/json_write.c create mode 100644 src/spdk/lib/jsonrpc/Makefile create mode 100644 src/spdk/lib/jsonrpc/jsonrpc_client.c create mode 100644 src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c create mode 100644 src/spdk/lib/jsonrpc/jsonrpc_internal.h create mode 100644 src/spdk/lib/jsonrpc/jsonrpc_server.c create mode 100644 src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c create mode 100644 src/spdk/lib/log/Makefile create mode 100644 src/spdk/lib/log/log.c create mode 100644 src/spdk/lib/log/log_flags.c create mode 100644 src/spdk/lib/log/rpc/Makefile create mode 100644 src/spdk/lib/log/rpc/log_rpc.c create mode 100644 src/spdk/lib/lvol/Makefile create mode 100644 src/spdk/lib/lvol/lvol.c create mode 100644 src/spdk/lib/nbd/Makefile create mode 100644 src/spdk/lib/nbd/nbd.c create mode 100644 src/spdk/lib/nbd/nbd_internal.h create mode 100644 
src/spdk/lib/nbd/nbd_rpc.c create mode 100644 src/spdk/lib/net/Makefile create mode 100644 src/spdk/lib/net/interface.c create mode 100644 src/spdk/lib/net/net_internal.h create mode 100644 src/spdk/lib/net/net_rpc.c create mode 100644 src/spdk/lib/nvme/Makefile create mode 100644 src/spdk/lib/nvme/nvme.c create mode 100644 src/spdk/lib/nvme/nvme_ctrlr.c create mode 100644 src/spdk/lib/nvme/nvme_ctrlr_cmd.c create mode 100644 src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c create mode 100644 src/spdk/lib/nvme/nvme_fabric.c create mode 100644 src/spdk/lib/nvme/nvme_internal.h create mode 100644 src/spdk/lib/nvme/nvme_ns.c create mode 100644 src/spdk/lib/nvme/nvme_ns_cmd.c create mode 100644 src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c create mode 100644 src/spdk/lib/nvme/nvme_pcie.c create mode 100644 src/spdk/lib/nvme/nvme_qpair.c create mode 100644 src/spdk/lib/nvme/nvme_quirks.c create mode 100644 src/spdk/lib/nvme/nvme_rdma.c create mode 100644 src/spdk/lib/nvme/nvme_transport.c create mode 100644 src/spdk/lib/nvme/nvme_uevent.c create mode 100644 src/spdk/lib/nvme/nvme_uevent.h create mode 100644 src/spdk/lib/nvmf/Makefile create mode 100644 src/spdk/lib/nvmf/ctrlr.c create mode 100644 src/spdk/lib/nvmf/ctrlr_bdev.c create mode 100644 src/spdk/lib/nvmf/ctrlr_discovery.c create mode 100644 src/spdk/lib/nvmf/nvmf.c create mode 100644 src/spdk/lib/nvmf/nvmf_fc.h create mode 100644 src/spdk/lib/nvmf/nvmf_internal.h create mode 100644 src/spdk/lib/nvmf/rdma.c create mode 100644 src/spdk/lib/nvmf/request.c create mode 100644 src/spdk/lib/nvmf/subsystem.c create mode 100644 src/spdk/lib/nvmf/transport.c create mode 100644 src/spdk/lib/nvmf/transport.h create mode 100644 src/spdk/lib/rocksdb/env_spdk.cc create mode 100644 src/spdk/lib/rocksdb/spdk.rocksdb.mk create mode 100644 src/spdk/lib/rpc/Makefile create mode 100644 src/spdk/lib/rpc/rpc.c create mode 100644 src/spdk/lib/scsi/Makefile create mode 100644 src/spdk/lib/scsi/dev.c create mode 100644 src/spdk/lib/scsi/lun.c create mode 100644 src/spdk/lib/scsi/port.c create mode 100644 src/spdk/lib/scsi/scsi.c create mode 100644 src/spdk/lib/scsi/scsi_bdev.c create mode 100644 src/spdk/lib/scsi/scsi_internal.h create mode 100644 src/spdk/lib/scsi/scsi_rpc.c create mode 100644 src/spdk/lib/scsi/task.c create mode 100644 src/spdk/lib/sock/Makefile create mode 100644 src/spdk/lib/sock/net_framework.c create mode 100644 src/spdk/lib/sock/posix/Makefile create mode 100644 src/spdk/lib/sock/posix/posix.c create mode 100644 src/spdk/lib/sock/sock.c create mode 100644 src/spdk/lib/sock/vpp/Makefile create mode 100644 src/spdk/lib/sock/vpp/vpp.c create mode 100644 src/spdk/lib/thread/Makefile create mode 100644 src/spdk/lib/thread/thread.c create mode 100644 src/spdk/lib/trace/Makefile create mode 100644 src/spdk/lib/trace/trace.c create mode 100644 src/spdk/lib/trace/trace_flags.c create mode 100644 src/spdk/lib/ut_mock/Makefile create mode 100644 src/spdk/lib/ut_mock/mock.c create mode 100644 src/spdk/lib/util/Makefile create mode 100644 src/spdk/lib/util/base64.c create mode 100644 src/spdk/lib/util/bit_array.c create mode 100644 src/spdk/lib/util/cpuset.c create mode 100644 src/spdk/lib/util/crc16.c create mode 100644 src/spdk/lib/util/crc32.c create mode 100644 src/spdk/lib/util/crc32_ieee.c create mode 100644 src/spdk/lib/util/crc32c.c create mode 100644 src/spdk/lib/util/fd.c create mode 100644 src/spdk/lib/util/strerror_tls.c create mode 100644 src/spdk/lib/util/string.c create mode 100644 src/spdk/lib/util/uuid.c create mode 100644 
src/spdk/lib/vhost/Makefile create mode 100644 src/spdk/lib/vhost/rte_vhost/Makefile create mode 100644 src/spdk/lib/vhost/rte_vhost/fd_man.c create mode 100644 src/spdk/lib/vhost/rte_vhost/fd_man.h create mode 100644 src/spdk/lib/vhost/rte_vhost/rte_vhost.h create mode 100644 src/spdk/lib/vhost/rte_vhost/socket.c create mode 100644 src/spdk/lib/vhost/rte_vhost/vhost.c create mode 100644 src/spdk/lib/vhost/rte_vhost/vhost.h create mode 100644 src/spdk/lib/vhost/rte_vhost/vhost_user.c create mode 100644 src/spdk/lib/vhost/rte_vhost/vhost_user.h create mode 100644 src/spdk/lib/vhost/vhost.c create mode 100644 src/spdk/lib/vhost/vhost_blk.c create mode 100644 src/spdk/lib/vhost/vhost_internal.h create mode 100644 src/spdk/lib/vhost/vhost_nvme.c create mode 100644 src/spdk/lib/vhost/vhost_rpc.c create mode 100644 src/spdk/lib/vhost/vhost_scsi.c create mode 100644 src/spdk/lib/virtio/Makefile create mode 100644 src/spdk/lib/virtio/virtio.c create mode 100644 src/spdk/lib/virtio/virtio_pci.c create mode 100644 src/spdk/lib/virtio/virtio_user.c create mode 100644 src/spdk/lib/virtio/virtio_user/vhost.h create mode 100644 src/spdk/lib/virtio/virtio_user/vhost_user.c (limited to 'src/spdk/lib') diff --git a/src/spdk/lib/Makefile b/src/spdk/lib/Makefile new file mode 100644 index 00000000..8de59e3a --- /dev/null +++ b/src/spdk/lib/Makefile @@ -0,0 +1,58 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +DIRS-y += bdev blob blobfs conf copy event json jsonrpc \ + log lvol net rpc sock thread trace util nvme nvmf scsi ioat \ + ut_mock iscsi +ifeq ($(OS),Linux) +DIRS-y += nbd +DIRS-$(CONFIG_VHOST) += vhost +DIRS-$(CONFIG_VIRTIO) += virtio +endif + +# If CONFIG_ENV is pointing at a directory in lib, build it. +# Out-of-tree env implementations must be built separately by the user. 
+ENV_NAME := $(notdir $(CONFIG_ENV)) +ifeq ($(abspath $(CONFIG_ENV)),$(SPDK_ROOT_DIR)/lib/$(ENV_NAME)) +DIRS-y += $(ENV_NAME) +endif + +.PHONY: all clean $(DIRS-y) + +all: $(DIRS-y) +clean: $(DIRS-y) + +include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk diff --git a/src/spdk/lib/bdev/Makefile b/src/spdk/lib/bdev/Makefile new file mode 100644 index 00000000..a5d30a9c --- /dev/null +++ b/src/spdk/lib/bdev/Makefile @@ -0,0 +1,60 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +ifeq ($(CONFIG_VTUNE),y) +CFLAGS += -I$(CONFIG_VTUNE_DIR)/include -I$(CONFIG_VTUNE_DIR)/sdk/src/ittnotify +endif + +C_SRCS = bdev.c part.c scsi_nvme.c +C_SRCS-$(CONFIG_VTUNE) += vtune.c +LIBNAME = bdev + +DIRS-y += error gpt lvol malloc null nvme passthru raid rpc split + +ifeq ($(CONFIG_CRYPTO),y) +DIRS-y += crypto +endif + +ifeq ($(OS),Linux) +DIRS-y += aio +DIRS-$(CONFIG_ISCSI_INITIATOR) += iscsi +DIRS-$(CONFIG_VIRTIO) += virtio +DIRS-$(CONFIG_PMDK) += pmem +endif + +DIRS-$(CONFIG_RBD) += rbd + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/aio/Makefile b/src/spdk/lib/bdev/aio/Makefile new file mode 100644 index 00000000..7a39e3d2 --- /dev/null +++ b/src/spdk/lib/bdev/aio/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_aio.c bdev_aio_rpc.c +LIBNAME = bdev_aio +LOCAL_SYS_LIBS = -laio + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/aio/bdev_aio.c b/src/spdk/lib/bdev/aio/bdev_aio.c new file mode 100644 index 00000000..bb0289ed --- /dev/null +++ b/src/spdk/lib/bdev/aio/bdev_aio.c @@ -0,0 +1,751 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_aio.h" + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +static int bdev_aio_initialize(void); +static void bdev_aio_fini(void); +static void aio_free_disk(struct file_disk *fdisk); +static void bdev_aio_get_spdk_running_config(FILE *fp); +static TAILQ_HEAD(, file_disk) g_aio_disk_head; + +#define SPDK_AIO_QUEUE_DEPTH 128 +#define MAX_EVENTS_PER_POLL 32 + +static int +bdev_aio_get_ctx_size(void) +{ + return sizeof(struct bdev_aio_task); +} + +static struct spdk_bdev_module aio_if = { + .name = "aio", + .module_init = bdev_aio_initialize, + .module_fini = bdev_aio_fini, + .config_text = bdev_aio_get_spdk_running_config, + .get_ctx_size = bdev_aio_get_ctx_size, +}; + +struct bdev_aio_group_channel { + struct spdk_poller *poller; + int epfd; +}; + +SPDK_BDEV_MODULE_REGISTER(&aio_if) + +static int +bdev_aio_open(struct file_disk *disk) +{ + int fd; + + fd = open(disk->filename, O_RDWR | O_DIRECT); + if (fd < 0) { + /* Try without O_DIRECT for non-disk files */ + fd = open(disk->filename, O_RDWR); + if (fd < 0) { + SPDK_ERRLOG("open() failed (file:%s), errno %d: %s\n", + disk->filename, errno, spdk_strerror(errno)); + disk->fd = -1; + return -1; + } + } + + disk->fd = fd; + + return 0; +} + +static int +bdev_aio_close(struct file_disk *disk) +{ + int rc; + + if (disk->fd == -1) { + return 0; + } + + rc = close(disk->fd); + if (rc < 0) { + SPDK_ERRLOG("close() failed (fd=%d), errno %d: %s\n", + disk->fd, errno, spdk_strerror(errno)); + return -1; + } + + disk->fd = -1; + + return 0; +} + +static int64_t +bdev_aio_readv(struct file_disk *fdisk, struct spdk_io_channel *ch, + struct bdev_aio_task *aio_task, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t offset) +{ + struct iocb *iocb = &aio_task->iocb; + struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); + int rc; + + io_prep_preadv(iocb, fdisk->fd, iov, iovcnt, offset); + iocb->data = aio_task; + aio_task->len = nbytes; + io_set_eventfd(iocb, aio_ch->efd); + + SPDK_DEBUGLOG(SPDK_LOG_AIO, "read %d iovs size %lu to off: %#lx\n", + iovcnt, nbytes, offset); + + rc = io_submit(aio_ch->io_ctx, 1, &iocb); + if (rc < 0) { + if (rc == -EAGAIN) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); + SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, rc); + } + return -1; + } + aio_ch->io_inflight++; + return nbytes; +} + +static int64_t +bdev_aio_writev(struct file_disk *fdisk, struct spdk_io_channel *ch, + struct bdev_aio_task *aio_task, + struct iovec *iov, int iovcnt, size_t len, uint64_t offset) +{ + struct iocb *iocb = &aio_task->iocb; + struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); + int rc; + + io_prep_pwritev(iocb, fdisk->fd, iov, iovcnt, offset); + iocb->data = aio_task; + aio_task->len = len; + io_set_eventfd(iocb, aio_ch->efd); + + SPDK_DEBUGLOG(SPDK_LOG_AIO, "write %d iovs size %lu from off: %#lx\n", + iovcnt, len, offset); + + rc = io_submit(aio_ch->io_ctx, 1, &iocb); + if (rc < 0) { + if (rc == -EAGAIN) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), SPDK_BDEV_IO_STATUS_FAILED); + SPDK_ERRLOG("%s: io_submit returned %d\n", __func__, 
rc); + } + return -1; + } + aio_ch->io_inflight++; + return len; +} + +static void +bdev_aio_flush(struct file_disk *fdisk, struct bdev_aio_task *aio_task) +{ + int rc = fsync(fdisk->fd); + + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), + rc == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +} + +static int +bdev_aio_destruct(void *ctx) +{ + struct file_disk *fdisk = ctx; + int rc = 0; + + TAILQ_REMOVE(&g_aio_disk_head, fdisk, link); + rc = bdev_aio_close(fdisk); + if (rc < 0) { + SPDK_ERRLOG("bdev_aio_close() failed\n"); + } + return rc; +} + +static int +bdev_aio_initialize_io_channel(struct bdev_aio_io_channel *ch) +{ + ch->efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (ch->efd == -1) { + SPDK_ERRLOG("Cannot create efd\n"); + return -1; + } + + if (io_setup(SPDK_AIO_QUEUE_DEPTH, &ch->io_ctx) < 0) { + close(ch->efd); + SPDK_ERRLOG("async I/O context setup failure\n"); + return -1; + } + + return 0; +} + +static int +bdev_aio_group_poll(void *arg) +{ + struct bdev_aio_group_channel *group_ch = arg; + struct bdev_aio_io_channel *ch; + int nr, i, j, rc, total_nr = 0; + enum spdk_bdev_io_status status; + struct bdev_aio_task *aio_task; + struct timespec timeout; + struct io_event events[SPDK_AIO_QUEUE_DEPTH]; + struct epoll_event epevents[MAX_EVENTS_PER_POLL]; + + timeout.tv_sec = 0; + timeout.tv_nsec = 0; + rc = epoll_wait(group_ch->epfd, epevents, MAX_EVENTS_PER_POLL, 0); + if (rc == -1) { + SPDK_ERRLOG("epoll_wait error(%d): %s on ch=%p\n", errno, spdk_strerror(errno), group_ch); + return -1; + } + + for (j = 0; j < rc; j++) { + ch = epevents[j].data.ptr; + nr = io_getevents(ch->io_ctx, 1, SPDK_AIO_QUEUE_DEPTH, + events, &timeout); + + if (nr < 0) { + SPDK_ERRLOG("Returned %d on bdev_aio_io_channel %p\n", nr, ch); + continue; + } + + total_nr += nr; + for (i = 0; i < nr; i++) { + aio_task = events[i].data; + if (events[i].res != aio_task->len) { + status = SPDK_BDEV_IO_STATUS_FAILED; + } else { + status = SPDK_BDEV_IO_STATUS_SUCCESS; + } + + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(aio_task), status); + ch->io_inflight--; + } + } + + return total_nr; +} + +static void +_bdev_aio_get_io_inflight(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct bdev_aio_io_channel *aio_ch = spdk_io_channel_get_ctx(ch); + + if (aio_ch->io_inflight) { + spdk_for_each_channel_continue(i, -1); + return; + } + + spdk_for_each_channel_continue(i, 0); +} + +static int bdev_aio_reset_retry_timer(void *arg); + +static void +_bdev_aio_get_io_inflight_done(struct spdk_io_channel_iter *i, int status) +{ + struct file_disk *fdisk = spdk_io_channel_iter_get_ctx(i); + + if (status == -1) { + fdisk->reset_retry_timer = spdk_poller_register(bdev_aio_reset_retry_timer, fdisk, 500); + return; + } + + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(fdisk->reset_task), SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static int +bdev_aio_reset_retry_timer(void *arg) +{ + struct file_disk *fdisk = arg; + + if (fdisk->reset_retry_timer) { + spdk_poller_unregister(&fdisk->reset_retry_timer); + } + + spdk_for_each_channel(fdisk, + _bdev_aio_get_io_inflight, + fdisk, + _bdev_aio_get_io_inflight_done); + + return -1; +} + +static void +bdev_aio_reset(struct file_disk *fdisk, struct bdev_aio_task *aio_task) +{ + fdisk->reset_task = aio_task; + + bdev_aio_reset_retry_timer(fdisk); +} + +static void bdev_aio_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + bdev_aio_readv((struct file_disk *)bdev_io->bdev->ctxt, + ch, + (struct 
bdev_aio_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); +} + +static int _bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_aio_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_aio_writev((struct file_disk *)bdev_io->bdev->ctxt, + ch, + (struct bdev_aio_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + return 0; + case SPDK_BDEV_IO_TYPE_FLUSH: + bdev_aio_flush((struct file_disk *)bdev_io->bdev->ctxt, + (struct bdev_aio_task *)bdev_io->driver_ctx); + return 0; + + case SPDK_BDEV_IO_TYPE_RESET: + bdev_aio_reset((struct file_disk *)bdev_io->bdev->ctxt, + (struct bdev_aio_task *)bdev_io->driver_ctx); + return 0; + default: + return -1; + } +} + +static void bdev_aio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_aio_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_aio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + default: + return false; + } +} + +static int +bdev_aio_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_aio_io_channel *ch = ctx_buf; + struct bdev_aio_group_channel *group_ch_ctx; + struct epoll_event epevent; + + if (bdev_aio_initialize_io_channel(ch) != 0) { + return -1; + } + + ch->group_ch = spdk_get_io_channel(&aio_if); + group_ch_ctx = spdk_io_channel_get_ctx(ch->group_ch); + + epevent.events = EPOLLIN | EPOLLET; + epevent.data.ptr = ch; + if (epoll_ctl(group_ch_ctx->epfd, EPOLL_CTL_ADD, ch->efd, &epevent)) { + close(ch->efd); + io_destroy(ch->io_ctx); + spdk_put_io_channel(ch->group_ch); + SPDK_ERRLOG("epoll_ctl error\n"); + return -1; + } + return 0; +} + +static void +bdev_aio_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_aio_io_channel *io_channel = ctx_buf; + struct bdev_aio_group_channel *group_ch_ctx; + struct epoll_event event; + + group_ch_ctx = spdk_io_channel_get_ctx(io_channel->group_ch); + epoll_ctl(group_ch_ctx->epfd, EPOLL_CTL_DEL, io_channel->efd, &event); + spdk_put_io_channel(io_channel->group_ch); + close(io_channel->efd); + io_destroy(io_channel->io_ctx); + +} + +static struct spdk_io_channel * +bdev_aio_get_io_channel(void *ctx) +{ + struct file_disk *fdisk = ctx; + + return spdk_get_io_channel(fdisk); +} + + +static int +bdev_aio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct file_disk *fdisk = ctx; + + spdk_json_write_name(w, "aio"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "filename"); + spdk_json_write_string(w, fdisk->filename); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_aio_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct file_disk *fdisk = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_aio_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + 
spdk_json_write_named_string(w, "name", bdev->name); + if (fdisk->block_size_override) { + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + } + spdk_json_write_named_string(w, "filename", fdisk->filename); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table aio_fn_table = { + .destruct = bdev_aio_destruct, + .submit_request = bdev_aio_submit_request, + .io_type_supported = bdev_aio_io_type_supported, + .get_io_channel = bdev_aio_get_io_channel, + .dump_info_json = bdev_aio_dump_info_json, + .write_config_json = bdev_aio_write_json_config, +}; + +static void aio_free_disk(struct file_disk *fdisk) +{ + if (fdisk == NULL) { + return; + } + free(fdisk->filename); + free(fdisk->disk.name); + free(fdisk); +} + +static int +bdev_aio_group_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_aio_group_channel *ch = ctx_buf; + + ch->epfd = epoll_create1(0); + if (ch->epfd == -1) { + SPDK_ERRLOG("cannot create epoll fd\n"); + return -1; + } + + ch->poller = spdk_poller_register(bdev_aio_group_poll, ch, 0); + return 0; +} + +static void +bdev_aio_group_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_aio_group_channel *ch = ctx_buf; + + close(ch->epfd); + spdk_poller_unregister(&ch->poller); +} + +struct spdk_bdev * +create_aio_disk(const char *name, const char *filename, uint32_t block_size) +{ + struct file_disk *fdisk; + uint32_t detected_block_size; + uint64_t disk_size; + int rc; + + fdisk = calloc(1, sizeof(*fdisk)); + if (!fdisk) { + SPDK_ERRLOG("Unable to allocate enough memory for aio backend\n"); + return NULL; + } + + fdisk->filename = strdup(filename); + if (!fdisk->filename) { + goto error_return; + } + + if (bdev_aio_open(fdisk)) { + SPDK_ERRLOG("Unable to open file %s. fd: %d errno: %d\n", filename, fdisk->fd, errno); + goto error_return; + } + + disk_size = spdk_fd_get_size(fdisk->fd); + + fdisk->disk.name = strdup(name); + if (!fdisk->disk.name) { + goto error_return; + } + fdisk->disk.product_name = "AIO disk"; + fdisk->disk.module = &aio_if; + + fdisk->disk.need_aligned_buffer = 1; + fdisk->disk.write_cache = 1; + + detected_block_size = spdk_fd_get_blocklen(fdisk->fd); + if (block_size == 0) { + /* User did not specify block size - use autodetected block size. 
*/ + if (detected_block_size == 0) { + SPDK_ERRLOG("Block size could not be auto-detected\n"); + goto error_return; + } + fdisk->block_size_override = false; + block_size = detected_block_size; + } else { + if (block_size < detected_block_size) { + SPDK_ERRLOG("Specified block size %" PRIu32 " is smaller than " + "auto-detected block size %" PRIu32 "\n", + block_size, detected_block_size); + goto error_return; + } else if (detected_block_size != 0 && block_size != detected_block_size) { + SPDK_WARNLOG("Specified block size %" PRIu32 " does not match " + "auto-detected block size %" PRIu32 "\n", + block_size, detected_block_size); + } + fdisk->block_size_override = true; + } + + if (block_size < 512) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be at least 512).\n", block_size); + goto error_return; + } + + if (!spdk_u32_is_pow2(block_size)) { + SPDK_ERRLOG("Invalid block size %" PRIu32 " (must be a power of 2.)\n", block_size); + goto error_return; + } + + fdisk->disk.blocklen = block_size; + + if (disk_size % fdisk->disk.blocklen != 0) { + SPDK_ERRLOG("Disk size %" PRIu64 " is not a multiple of block size %" PRIu32 "\n", + disk_size, fdisk->disk.blocklen); + goto error_return; + } + + fdisk->disk.blockcnt = disk_size / fdisk->disk.blocklen; + fdisk->disk.ctxt = fdisk; + + fdisk->disk.fn_table = &aio_fn_table; + + spdk_io_device_register(fdisk, bdev_aio_create_cb, bdev_aio_destroy_cb, + sizeof(struct bdev_aio_io_channel), + fdisk->disk.name); + rc = spdk_bdev_register(&fdisk->disk); + if (rc) { + spdk_io_device_unregister(fdisk, NULL); + goto error_return; + } + + TAILQ_INSERT_TAIL(&g_aio_disk_head, fdisk, link); + return &fdisk->disk; + +error_return: + bdev_aio_close(fdisk); + aio_free_disk(fdisk); + return NULL; +} + +static void +aio_io_device_unregister_cb(void *io_device) +{ + struct file_disk *fdisk = io_device; + spdk_delete_aio_complete cb_fn = fdisk->delete_cb_fn; + void *cb_arg = fdisk->delete_cb_arg; + + aio_free_disk(fdisk); + cb_fn(cb_arg, 0); +} + +static void +aio_bdev_unregister_cb(void *arg, int bdeverrno) +{ + struct file_disk *fdisk = arg; + + if (bdeverrno != 0) { + fdisk->delete_cb_fn(fdisk->delete_cb_arg, bdeverrno); + return; + } + + spdk_io_device_unregister(fdisk, aio_io_device_unregister_cb); +} + +void +delete_aio_disk(struct spdk_bdev *bdev, spdk_delete_aio_complete cb_fn, void *cb_arg) +{ + struct file_disk *fdisk; + + if (!bdev || bdev->module != &aio_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + fdisk = bdev->ctxt; + fdisk->delete_cb_fn = cb_fn; + fdisk->delete_cb_arg = cb_arg; + spdk_bdev_unregister(bdev, aio_bdev_unregister_cb, fdisk); +} + +static int +bdev_aio_initialize(void) +{ + size_t i; + struct spdk_conf_section *sp; + struct spdk_bdev *bdev; + + TAILQ_INIT(&g_aio_disk_head); + spdk_io_device_register(&aio_if, bdev_aio_group_create_cb, bdev_aio_group_destroy_cb, + sizeof(struct bdev_aio_group_channel), + "aio_module"); + + sp = spdk_conf_find_section(NULL, "AIO"); + if (!sp) { + return 0; + } + + i = 0; + while (true) { + const char *file; + const char *name; + const char *block_size_str; + uint32_t block_size = 0; + + file = spdk_conf_section_get_nmval(sp, "AIO", i, 0); + if (!file) { + break; + } + + name = spdk_conf_section_get_nmval(sp, "AIO", i, 1); + if (!name) { + SPDK_ERRLOG("No name provided for AIO disk with file %s\n", file); + i++; + continue; + } + + block_size_str = spdk_conf_section_get_nmval(sp, "AIO", i, 2); + if (block_size_str) { + block_size = atoi(block_size_str); + } + + bdev = create_aio_disk(name, file, 
block_size); + if (!bdev) { + SPDK_ERRLOG("Unable to create AIO bdev from file %s\n", file); + i++; + continue; + } + + i++; + } + + return 0; +} + +static void +bdev_aio_fini(void) +{ + spdk_io_device_unregister(&aio_if, NULL); +} + +static void +bdev_aio_get_spdk_running_config(FILE *fp) +{ + char *file; + char *name; + uint32_t block_size; + struct file_disk *fdisk; + + fprintf(fp, + "\n" + "# Users must change this section to match the /dev/sdX devices to be\n" + "# exported as iSCSI LUNs. The devices are accessed using Linux AIO.\n" + "# The format is:\n" + "# AIO []\n" + "# The file name is the backing device\n" + "# The bdev name can be referenced from elsewhere in the configuration file.\n" + "# Block size may be omitted to automatically detect the block size of a disk.\n" + "[AIO]\n"); + + TAILQ_FOREACH(fdisk, &g_aio_disk_head, link) { + file = fdisk->filename; + name = fdisk->disk.name; + block_size = fdisk->disk.blocklen; + fprintf(fp, " AIO %s %s ", file, name); + if (fdisk->block_size_override) { + fprintf(fp, "%d", block_size); + } + fprintf(fp, "\n"); + } + fprintf(fp, "\n"); +} + +SPDK_LOG_REGISTER_COMPONENT("aio", SPDK_LOG_AIO) diff --git a/src/spdk/lib/bdev/aio/bdev_aio.h b/src/spdk/lib/bdev/aio/bdev_aio.h new file mode 100644 index 00000000..f58e9822 --- /dev/null +++ b/src/spdk/lib/bdev/aio/bdev_aio.h @@ -0,0 +1,80 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BDEV_AIO_H +#define SPDK_BDEV_AIO_H + +#include "spdk/stdinc.h" + +#include +#include +#include + +#include "spdk/queue.h" +#include "spdk/bdev.h" + +#include "spdk/bdev_module.h" + +struct bdev_aio_task { + struct iocb iocb; + uint64_t len; + TAILQ_ENTRY(bdev_aio_task) link; +}; + +struct bdev_aio_io_channel { + io_context_t io_ctx; + uint64_t io_inflight; + struct spdk_io_channel *group_ch; + TAILQ_ENTRY(bdev_aio_io_channel) link; + int efd; +}; + +typedef void (*spdk_delete_aio_complete)(void *cb_arg, int bdeverrno); + +struct file_disk { + struct bdev_aio_task *reset_task; + struct spdk_poller *reset_retry_timer; + struct spdk_bdev disk; + char *filename; + int fd; + TAILQ_ENTRY(file_disk) link; + bool block_size_override; + spdk_delete_aio_complete delete_cb_fn; + void *delete_cb_arg; +}; + +struct spdk_bdev *create_aio_disk(const char *name, const char *filename, uint32_t block_size); + +void delete_aio_disk(struct spdk_bdev *bdev, spdk_delete_aio_complete cb_fn, void *cb_arg); + +#endif // SPDK_BDEV_AIO_H diff --git a/src/spdk/lib/bdev/aio/bdev_aio_rpc.c b/src/spdk/lib/bdev/aio/bdev_aio_rpc.c new file mode 100644 index 00000000..10dd237a --- /dev/null +++ b/src/spdk/lib/bdev/aio/bdev_aio_rpc.c @@ -0,0 +1,160 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_aio.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_construct_aio { + char *name; + char *filename; + uint32_t block_size; +}; + +static void +free_rpc_construct_aio(struct rpc_construct_aio *req) +{ + free(req->name); + free(req->filename); +} + +static const struct spdk_json_object_decoder rpc_construct_aio_decoders[] = { + {"name", offsetof(struct rpc_construct_aio, name), spdk_json_decode_string}, + {"filename", offsetof(struct rpc_construct_aio, filename), spdk_json_decode_string, true}, + {"block_size", offsetof(struct rpc_construct_aio, block_size), spdk_json_decode_uint32, true}, +}; + +static void +spdk_rpc_construct_aio_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_aio req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_construct_aio_decoders, + SPDK_COUNTOF(rpc_construct_aio_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.filename == NULL) { + goto invalid; + } + + bdev = create_aio_disk(req.name, req.filename, req.block_size); + if (bdev == NULL) { + goto invalid; + } + + free_rpc_construct_aio(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_construct_aio(&req); +} +SPDK_RPC_REGISTER("construct_aio_bdev", spdk_rpc_construct_aio_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_aio { + char *name; +}; + +static void +free_rpc_delete_aio(struct rpc_delete_aio *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_aio_decoders[] = { + {"name", offsetof(struct rpc_delete_aio, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_aio_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_aio_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_aio req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_aio_decoders, + SPDK_COUNTOF(rpc_delete_aio_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + delete_aio_disk(bdev, _spdk_rpc_delete_aio_bdev_cb, request); + + free_rpc_delete_aio(&req); + + return; + +invalid: + free_rpc_delete_aio(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_aio_bdev", spdk_rpc_delete_aio_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/bdev.c b/src/spdk/lib/bdev/bdev.c new file mode 100644 index 00000000..ab82fffd --- /dev/null +++ b/src/spdk/lib/bdev/bdev.c @@ -0,0 +1,3950 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" + +#include "spdk/config.h" +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/nvme_spec.h" +#include "spdk/scsi_spec.h" +#include "spdk/util.h" +#include "spdk/trace.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" + +#ifdef SPDK_CONFIG_VTUNE +#include "ittnotify.h" +#include "ittnotify_types.h" +int __itt_init_ittlib(const char *, __itt_group_id); +#endif + +#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024) +#define SPDK_BDEV_IO_CACHE_SIZE 256 +#define BUF_SMALL_POOL_SIZE 8192 +#define BUF_LARGE_POOL_SIZE 1024 +#define NOMEM_THRESHOLD_COUNT 8 +#define ZERO_BUFFER_SIZE 0x100000 + +#define OWNER_BDEV 0x2 + +#define OBJECT_BDEV_IO 0x2 + +#define TRACE_GROUP_BDEV 0x3 +#define TRACE_BDEV_IO_START SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x0) +#define TRACE_BDEV_IO_DONE SPDK_TPOINT_ID(TRACE_GROUP_BDEV, 0x1) + +#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 +#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 +#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 +#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 10000 +#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (10 * 1024 * 1024) +#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX + +static const char *qos_conf_type[] = {"Limit_IOPS", "Limit_BPS"}; +static const char *qos_rpc_type[] = {"rw_ios_per_sec", "rw_mbytes_per_sec"}; + +TAILQ_HEAD(spdk_bdev_list, spdk_bdev); + +struct spdk_bdev_mgr { + struct spdk_mempool *bdev_io_pool; + + struct spdk_mempool *buf_small_pool; + struct spdk_mempool *buf_large_pool; + + void *zero_buffer; + + TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; + + struct spdk_bdev_list bdevs; + + bool init_complete; + bool module_init_complete; + +#ifdef SPDK_CONFIG_VTUNE + __itt_domain *domain; +#endif +}; + +static struct spdk_bdev_mgr g_bdev_mgr = { + .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), + .bdevs = 
TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), + .init_complete = false, + .module_init_complete = false, +}; + +static struct spdk_bdev_opts g_bdev_opts = { + .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, + .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, +}; + +static spdk_bdev_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_bdev_fini_cb g_fini_cb_fn = NULL; +static void *g_fini_cb_arg = NULL; +static struct spdk_thread *g_fini_thread = NULL; + +struct spdk_bdev_qos_limit { + /** IOs or bytes allowed per second (i.e., 1s). */ + uint64_t limit; + + /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). + * For remaining bytes, allowed to run negative if an I/O is submitted when + * some bytes are remaining, but the I/O is bigger than that amount. The + * excess will be deducted from the next timeslice. + */ + int64_t remaining_this_timeslice; + + /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ + uint32_t min_per_timeslice; + + /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ + uint32_t max_per_timeslice; +}; + +struct spdk_bdev_qos { + /** Types of structure of rate limits. */ + struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + /** The channel that all I/O are funneled through. */ + struct spdk_bdev_channel *ch; + + /** The thread on which the poller is running. */ + struct spdk_thread *thread; + + /** Queue of I/O waiting to be issued. */ + bdev_io_tailq_t queued; + + /** Size of a timeslice in tsc ticks. */ + uint64_t timeslice_size; + + /** Timestamp of start of last timeslice. */ + uint64_t last_timeslice; + + /** Poller that processes queued I/O commands each time slice. */ + struct spdk_poller *poller; +}; + +struct spdk_bdev_mgmt_channel { + bdev_io_stailq_t need_buf_small; + bdev_io_stailq_t need_buf_large; + + /* + * Each thread keeps a cache of bdev_io - this allows + * bdev threads which are *not* DPDK threads to still + * benefit from a per-thread bdev_io cache. Without + * this, non-DPDK threads fetching from the mempool + * incur a cmpxchg on get and put. + */ + bdev_io_stailq_t per_thread_cache; + uint32_t per_thread_cache_count; + uint32_t bdev_io_cache_size; + + TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; + TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; +}; + +/* + * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device + * will queue here their IO that awaits retry. It makes it possible to retry sending + * IO to one bdev after IO from other bdev completes. + */ +struct spdk_bdev_shared_resource { + /* The bdev management channel */ + struct spdk_bdev_mgmt_channel *mgmt_ch; + + /* + * Count of I/O submitted to bdev module and waiting for completion. + * Incremented before submit_request() is called on an spdk_bdev_io. + */ + uint64_t io_outstanding; + + /* + * Queue of IO awaiting retry because of a previous NOMEM status returned + * on this channel. + */ + bdev_io_tailq_t nomem_io; + + /* + * Threshold which io_outstanding must drop to before retrying nomem_io. 
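+ * Retries are thus deferred until the bdev module has completed enough of its outstanding I/O that resubmission is likely to make forward progress.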
+ */ + uint64_t nomem_threshold; + + /* I/O channel allocated by a bdev module */ + struct spdk_io_channel *shared_ch; + + /* Refcount of bdev channels using this resource */ + uint32_t ref; + + TAILQ_ENTRY(spdk_bdev_shared_resource) link; +}; + +#define BDEV_CH_RESET_IN_PROGRESS (1 << 0) +#define BDEV_CH_QOS_ENABLED (1 << 1) + +struct spdk_bdev_channel { + struct spdk_bdev *bdev; + + /* The channel for the underlying device */ + struct spdk_io_channel *channel; + + /* Per io_device per thread data */ + struct spdk_bdev_shared_resource *shared_resource; + + struct spdk_bdev_io_stat stat; + + /* + * Count of I/O submitted through this channel and waiting for completion. + * Incremented before submit_request() is called on an spdk_bdev_io. + */ + uint64_t io_outstanding; + + bdev_io_tailq_t queued_resets; + + uint32_t flags; + +#ifdef SPDK_CONFIG_VTUNE + uint64_t start_tsc; + uint64_t interval_tsc; + __itt_string_handle *handle; + struct spdk_bdev_io_stat prev_stat; +#endif + +}; + +struct spdk_bdev_desc { + struct spdk_bdev *bdev; + struct spdk_thread *thread; + spdk_bdev_remove_cb_t remove_cb; + void *remove_ctx; + bool remove_scheduled; + bool closed; + bool write; + TAILQ_ENTRY(spdk_bdev_desc) link; +}; + +struct spdk_bdev_iostat_ctx { + struct spdk_bdev_io_stat *stat; + spdk_bdev_get_device_stat_cb cb; + void *cb_arg; +}; + +#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) +#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) + +static void _spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg); +static void _spdk_bdev_write_zero_buffer_next(void *_bdev_io); + +void +spdk_bdev_get_opts(struct spdk_bdev_opts *opts) +{ + *opts = g_bdev_opts; +} + +int +spdk_bdev_set_opts(struct spdk_bdev_opts *opts) +{ + uint32_t min_pool_size; + + /* + * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem + * initialization. A second mgmt_ch will be created on the same thread when the application starts + * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
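+ * For example, with a bdev_io_cache_size of 256 and 4 threads, bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.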
+ */ + min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); + if (opts->bdev_io_pool_size < min_pool_size) { + SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 + " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, + spdk_thread_get_count()); + SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); + return -1; + } + + g_bdev_opts = *opts; + return 0; +} + +struct spdk_bdev * +spdk_bdev_first(void) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next(struct spdk_bdev *prev) +{ + struct spdk_bdev *bdev; + + bdev = TAILQ_NEXT(prev, internal.link); + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +static struct spdk_bdev * +_bdev_next_leaf(struct spdk_bdev *bdev) +{ + while (bdev != NULL) { + if (bdev->internal.claim_module == NULL) { + return bdev; + } else { + bdev = TAILQ_NEXT(bdev, internal.link); + } + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_first_leaf(void) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Starting bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_next_leaf(struct spdk_bdev *prev) +{ + struct spdk_bdev *bdev; + + bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); + + if (bdev) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Continuing bdev iteration at %s\n", bdev->name); + } + + return bdev; +} + +struct spdk_bdev * +spdk_bdev_get_by_name(const char *bdev_name) +{ + struct spdk_bdev_alias *tmp; + struct spdk_bdev *bdev = spdk_bdev_first(); + + while (bdev != NULL) { + if (strcmp(bdev_name, bdev->name) == 0) { + return bdev; + } + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(bdev_name, tmp->alias) == 0) { + return bdev; + } + } + + bdev = spdk_bdev_next(bdev); + } + + return NULL; +} + +void +spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) +{ + struct iovec *iovs; + + iovs = bdev_io->u.bdev.iovs; + + assert(iovs != NULL); + assert(bdev_io->u.bdev.iovcnt >= 1); + + iovs[0].iov_base = buf; + iovs[0].iov_len = len; +} + +static void +spdk_bdev_io_put_buf(struct spdk_bdev_io *bdev_io) +{ + struct spdk_mempool *pool; + struct spdk_bdev_io *tmp; + void *buf, *aligned_buf; + bdev_io_stailq_t *stailq; + struct spdk_bdev_mgmt_channel *ch; + + assert(bdev_io->u.bdev.iovcnt == 1); + + buf = bdev_io->internal.buf; + ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + bdev_io->internal.buf = NULL; + + if (bdev_io->internal.buf_len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &ch->need_buf_large; + } + + if (STAILQ_EMPTY(stailq)) { + spdk_mempool_put(pool, buf); + } else { + tmp = STAILQ_FIRST(stailq); + + aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL); + spdk_bdev_io_set_buf(tmp, aligned_buf, tmp->internal.buf_len); + + STAILQ_REMOVE_HEAD(stailq, internal.buf_link); + tmp->internal.buf = buf; + tmp->internal.get_buf_cb(tmp->internal.ch->channel, tmp); + } +} + +void +spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) +{ + struct spdk_mempool *pool; + bdev_io_stailq_t *stailq; + 
void *buf, *aligned_buf; + struct spdk_bdev_mgmt_channel *mgmt_ch; + + assert(cb != NULL); + assert(bdev_io->u.bdev.iovs != NULL); + + if (spdk_unlikely(bdev_io->u.bdev.iovs[0].iov_base != NULL)) { + /* Buffer already present */ + cb(bdev_io->internal.ch->channel, bdev_io); + return; + } + + assert(len <= SPDK_BDEV_LARGE_BUF_MAX_SIZE); + mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + bdev_io->internal.buf_len = len; + bdev_io->internal.get_buf_cb = cb; + if (len <= SPDK_BDEV_SMALL_BUF_MAX_SIZE) { + pool = g_bdev_mgr.buf_small_pool; + stailq = &mgmt_ch->need_buf_small; + } else { + pool = g_bdev_mgr.buf_large_pool; + stailq = &mgmt_ch->need_buf_large; + } + + buf = spdk_mempool_get(pool); + + if (!buf) { + STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); + } else { + aligned_buf = (void *)(((uintptr_t)buf + 511) & ~511UL); + spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); + + bdev_io->internal.buf = buf; + bdev_io->internal.get_buf_cb(bdev_io->internal.ch->channel, bdev_io); + } +} + +static int +spdk_bdev_module_get_max_ctx_size(void) +{ + struct spdk_bdev_module *bdev_module; + int max_bdev_module_size = 0; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { + max_bdev_module_size = bdev_module->get_ctx_size(); + } + } + + return max_bdev_module_size; +} + +void +spdk_bdev_config_text(FILE *fp) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_text) { + bdev_module->config_text(fp); + } + } +} + +static void +spdk_bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + int i; + struct spdk_bdev_qos *qos = bdev->internal.qos; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + + if (!qos) { + return; + } + + spdk_bdev_get_qos_rate_limits(bdev, limits); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "set_bdev_qos_limit"); + spdk_json_write_name(w, "params"); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", bdev->name); + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] > 0) { + spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); + } + } + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +void +spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_bdev_module *bdev_module; + struct spdk_bdev *bdev; + + assert(w != NULL); + + spdk_json_write_array_begin(w); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "set_bdev_options"); + spdk_json_write_name(w, "params"); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); + spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (bdev_module->config_json) { + bdev_module->config_json(w); + } + } + + TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { + spdk_bdev_qos_config_json(bdev, w); + + if (bdev->fn_table->write_config_json) { + bdev->fn_table->write_config_json(bdev, w); + } + } + + spdk_json_write_array_end(w); +} + +static int +spdk_bdev_mgmt_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io 
*bdev_io; + uint32_t i; + + STAILQ_INIT(&ch->need_buf_small); + STAILQ_INIT(&ch->need_buf_large); + + STAILQ_INIT(&ch->per_thread_cache); + ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; + + /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ + ch->per_thread_cache_count = 0; + for (i = 0; i < ch->bdev_io_cache_size; i++) { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + assert(bdev_io != NULL); + ch->per_thread_cache_count++; + STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link); + } + + TAILQ_INIT(&ch->shared_resources); + TAILQ_INIT(&ch->io_wait_queue); + + return 0; +} + +static void +spdk_bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_mgmt_channel *ch = ctx_buf; + struct spdk_bdev_io *bdev_io; + + if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); + } + + if (!TAILQ_EMPTY(&ch->shared_resources)) { + SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); + } + + while (!STAILQ_EMPTY(&ch->per_thread_cache)) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } + + assert(ch->per_thread_cache_count == 0); +} + +static void +spdk_bdev_init_complete(int rc) +{ + spdk_bdev_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + struct spdk_bdev_module *m; + + g_bdev_mgr.init_complete = true; + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + /* + * For modules that need to know when subsystem init is complete, + * inform them now. + */ + if (rc == 0) { + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->init_complete) { + m->init_complete(); + } + } + } + + cb_fn(cb_arg, rc); +} + +static void +spdk_bdev_module_action_complete(void) +{ + struct spdk_bdev_module *m; + + /* + * Don't finish bdev subsystem initialization if + * module pre-initialization is still in progress, or + * the subsystem been already initialized. + */ + if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { + return; + } + + /* + * Check all bdev modules for inits/examinations in progress. If any + * exist, return immediately since we cannot finish bdev subsystem + * initialization until all are completed. + */ + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->internal.action_in_progress > 0) { + return; + } + } + + /* + * Modules already finished initialization - now that all + * the bdev modules have finished their asynchronous I/O + * processing, the entire bdev layer can be marked as complete. 
+ */ + spdk_bdev_init_complete(0); +} + +static void +spdk_bdev_module_action_done(struct spdk_bdev_module *module) +{ + assert(module->internal.action_in_progress > 0); + module->internal.action_in_progress--; + spdk_bdev_module_action_complete(); +} + +void +spdk_bdev_module_init_done(struct spdk_bdev_module *module) +{ + spdk_bdev_module_action_done(module); +} + +void +spdk_bdev_module_examine_done(struct spdk_bdev_module *module) +{ + spdk_bdev_module_action_done(module); +} + +/** The last initialized bdev module */ +static struct spdk_bdev_module *g_resume_bdev_module = NULL; + +static int +spdk_bdev_modules_init(void) +{ + struct spdk_bdev_module *module; + int rc = 0; + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + g_resume_bdev_module = module; + rc = module->module_init(); + if (rc != 0) { + return rc; + } + } + + g_resume_bdev_module = NULL; + return 0; +} + + +static void +spdk_bdev_init_failed_complete(void *cb_arg) +{ + spdk_bdev_init_complete(-1); +} + +static void +spdk_bdev_init_failed(void *cb_arg) +{ + spdk_bdev_finish(spdk_bdev_init_failed_complete, NULL); +} + +void +spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) +{ + struct spdk_conf_section *sp; + struct spdk_bdev_opts bdev_opts; + int32_t bdev_io_pool_size, bdev_io_cache_size; + int cache_size; + int rc = 0; + char mempool_name[32]; + + assert(cb_fn != NULL); + + sp = spdk_conf_find_section(NULL, "Bdev"); + if (sp != NULL) { + spdk_bdev_get_opts(&bdev_opts); + + bdev_io_pool_size = spdk_conf_section_get_intval(sp, "BdevIoPoolSize"); + if (bdev_io_pool_size >= 0) { + bdev_opts.bdev_io_pool_size = bdev_io_pool_size; + } + + bdev_io_cache_size = spdk_conf_section_get_intval(sp, "BdevIoCacheSize"); + if (bdev_io_cache_size >= 0) { + bdev_opts.bdev_io_cache_size = bdev_io_cache_size; + } + + if (spdk_bdev_set_opts(&bdev_opts)) { + spdk_bdev_init_complete(-1); + return; + } + + assert(memcmp(&bdev_opts, &g_bdev_opts, sizeof(bdev_opts)) == 0); + } + + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); + + g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, + g_bdev_opts.bdev_io_pool_size, + sizeof(struct spdk_bdev_io) + + spdk_bdev_module_get_max_ctx_size(), + 0, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_bdev_mgr.bdev_io_pool == NULL) { + SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); + spdk_bdev_init_complete(-1); + return; + } + + /** + * Ensure no more than half of the total buffers end up local caches, by + * using spdk_thread_get_count() to determine how many local caches we need + * to account for. 
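+ * For example, with BUF_SMALL_POOL_SIZE of 8192 and 4 threads, each per-thread cache is capped at 8192 / (2 * 4) = 1024 small buffers.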
+ */ + cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_thread_get_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); + + g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, + BUF_SMALL_POOL_SIZE, + SPDK_BDEV_SMALL_BUF_MAX_SIZE + 512, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_small_pool) { + SPDK_ERRLOG("create rbuf small pool failed\n"); + spdk_bdev_init_complete(-1); + return; + } + + cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_thread_get_count()); + snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); + + g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, + BUF_LARGE_POOL_SIZE, + SPDK_BDEV_LARGE_BUF_MAX_SIZE + 512, + cache_size, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_bdev_mgr.buf_large_pool) { + SPDK_ERRLOG("create rbuf large pool failed\n"); + spdk_bdev_init_complete(-1); + return; + } + + g_bdev_mgr.zero_buffer = spdk_dma_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, + NULL); + if (!g_bdev_mgr.zero_buffer) { + SPDK_ERRLOG("create bdev zero buffer failed\n"); + spdk_bdev_init_complete(-1); + return; + } + +#ifdef SPDK_CONFIG_VTUNE + g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); +#endif + + spdk_io_device_register(&g_bdev_mgr, spdk_bdev_mgmt_channel_create, + spdk_bdev_mgmt_channel_destroy, + sizeof(struct spdk_bdev_mgmt_channel), + "bdev_mgr"); + + rc = spdk_bdev_modules_init(); + g_bdev_mgr.module_init_complete = true; + if (rc != 0) { + SPDK_ERRLOG("bdev modules init failed\n"); + spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_init_failed, NULL); + return; + } + + spdk_bdev_module_action_complete(); +} + +static void +spdk_bdev_mgr_unregister_cb(void *io_device) +{ + spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; + + if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { + SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.bdev_io_pool), + g_bdev_opts.bdev_io_pool_size); + } + + if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != BUF_SMALL_POOL_SIZE) { + SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_small_pool), + BUF_SMALL_POOL_SIZE); + assert(false); + } + + if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != BUF_LARGE_POOL_SIZE) { + SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", + spdk_mempool_count(g_bdev_mgr.buf_large_pool), + BUF_LARGE_POOL_SIZE); + assert(false); + } + + spdk_mempool_free(g_bdev_mgr.bdev_io_pool); + spdk_mempool_free(g_bdev_mgr.buf_small_pool); + spdk_mempool_free(g_bdev_mgr.buf_large_pool); + spdk_dma_free(g_bdev_mgr.zero_buffer); + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; + g_bdev_mgr.init_complete = false; + g_bdev_mgr.module_init_complete = false; +} + +static void +spdk_bdev_module_finish_iter(void *arg) +{ + struct spdk_bdev_module *bdev_module; + + /* Start iterating from the last touched module */ + if (!g_resume_bdev_module) { + bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); + } else { + bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, + internal.tailq); + } + + while (bdev_module) { + if (bdev_module->async_fini) { + /* Save our place so we can resume later. We must + * save the variable here, before calling module_fini() + * below, because in some cases the module may immediately + * call spdk_bdev_module_finish_done() and re-enter + * this function to continue iterating. 
*/ + g_resume_bdev_module = bdev_module; + } + + if (bdev_module->module_fini) { + bdev_module->module_fini(); + } + + if (bdev_module->async_fini) { + return; + } + + bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, + internal.tailq); + } + + g_resume_bdev_module = NULL; + spdk_io_device_unregister(&g_bdev_mgr, spdk_bdev_mgr_unregister_cb); +} + +void +spdk_bdev_module_finish_done(void) +{ + if (spdk_get_thread() != g_fini_thread) { + spdk_thread_send_msg(g_fini_thread, spdk_bdev_module_finish_iter, NULL); + } else { + spdk_bdev_module_finish_iter(NULL); + } +} + +static void +_spdk_bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) +{ + struct spdk_bdev *bdev = cb_arg; + + if (bdeverrno && bdev) { + SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", + bdev->name); + + /* + * Since the call to spdk_bdev_unregister() failed, we have no way to free this + * bdev; try to continue by manually removing this bdev from the list and continue + * with the next bdev in the list. + */ + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + } + + if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Done unregistering bdevs\n"); + /* + * Bdev module finish need to be deffered as we might be in the middle of some context + * (like bdev part free) that will use this bdev (or private bdev driver ctx data) + * after returning. + */ + spdk_thread_send_msg(spdk_get_thread(), spdk_bdev_module_finish_iter, NULL); + return; + } + + /* + * Unregister the last bdev in the list. The last bdev in the list should be a bdev + * that has no bdevs that depend on it. + */ + bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Unregistering bdev '%s'\n", bdev->name); + spdk_bdev_unregister(bdev, _spdk_bdev_finish_unregister_bdevs_iter, bdev); +} + +void +spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev_module *m; + + assert(cb_fn != NULL); + + g_fini_thread = spdk_get_thread(); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (m->fini_start) { + m->fini_start(); + } + } + + _spdk_bdev_finish_unregister_bdevs_iter(NULL, 0); +} + +static struct spdk_bdev_io * +spdk_bdev_get_io(struct spdk_bdev_channel *channel) +{ + struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; + struct spdk_bdev_io *bdev_io; + + if (ch->per_thread_cache_count > 0) { + bdev_io = STAILQ_FIRST(&ch->per_thread_cache); + STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); + ch->per_thread_cache_count--; + } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { + /* + * Don't try to look for bdev_ios in the global pool if there are + * waiters on bdev_ios - we don't want this caller to jump the line. 
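+ * Returning NULL here surfaces -ENOMEM to the submitter, which can then register with spdk_bdev_queue_io_wait() and take its turn behind the existing waiters.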
+ */ + bdev_io = NULL; + } else { + bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); + } + + return bdev_io; +} + +void +spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_mgmt_channel *ch = bdev_io->internal.ch->shared_resource->mgmt_ch; + + assert(bdev_io != NULL); + assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); + + if (bdev_io->internal.buf != NULL) { + spdk_bdev_io_put_buf(bdev_io); + } + + if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { + ch->per_thread_cache_count++; + STAILQ_INSERT_TAIL(&ch->per_thread_cache, bdev_io, internal.buf_link); + while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { + struct spdk_bdev_io_wait_entry *entry; + + entry = TAILQ_FIRST(&ch->io_wait_queue); + TAILQ_REMOVE(&ch->io_wait_queue, entry, link); + entry->cb_fn(entry->cb_arg); + } + } else { + /* We should never have a full cache with entries on the io wait queue. */ + assert(TAILQ_EMPTY(&ch->io_wait_queue)); + spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); + } +} + +static bool +_spdk_bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) +{ + assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + switch (limit) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + return true; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + return false; + case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: + default: + return false; + } +} + +static bool +_spdk_bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return true; + default: + return false; + } +} + +static uint64_t +_spdk_bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_NVME_IO: + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return bdev_io->u.nvme_passthru.nbytes; + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return bdev_io->u.bdev.num_blocks * bdev->blocklen; + default: + return 0; + } +} + +static void +_spdk_bdev_qos_update_per_io(struct spdk_bdev_qos *qos, uint64_t io_size_in_byte) +{ + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + switch (i) { + case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: + qos->rate_limits[i].remaining_this_timeslice--; + break; + case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: + qos->rate_limits[i].remaining_this_timeslice -= io_size_in_byte; + break; + case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: + default: + break; + } + } +} + +static void +_spdk_bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) +{ + struct spdk_bdev_io *bdev_io = NULL; + struct spdk_bdev *bdev = ch->bdev; + struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; + int i; + bool to_limit_io; + uint64_t io_size_in_byte; + + while (!TAILQ_EMPTY(&qos->queued)) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].max_per_timeslice > 0 && + (qos->rate_limits[i].remaining_this_timeslice <= 0)) { + return; + } + } + + bdev_io = TAILQ_FIRST(&qos->queued); + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + ch->io_outstanding++; + shared_resource->io_outstanding++; + to_limit_io = 
_spdk_bdev_qos_io_to_limit(bdev_io); + if (to_limit_io == true) { + io_size_in_byte = _spdk_bdev_get_io_size_in_byte(bdev_io); + _spdk_bdev_qos_update_per_io(qos, io_size_in_byte); + } + bdev->fn_table->submit_request(ch->channel, bdev_io); + } +} + +static void +_spdk_bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) +{ + int rc; + + bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; + bdev_io->internal.waitq_entry.cb_fn = cb_fn; + bdev_io->internal.waitq_entry.cb_arg = bdev_io; + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), + &bdev_io->internal.waitq_entry); + if (rc != 0) { + SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static bool +_spdk_bdev_io_type_can_split(uint8_t type) +{ + assert(type != SPDK_BDEV_IO_TYPE_INVALID); + assert(type < SPDK_BDEV_NUM_IO_TYPES); + + /* Only split READ and WRITE I/O. Theoretically other types of I/O like + * UNMAP could be split, but these types of I/O are typically much larger + * in size (sometimes the size of the entire block device), and the bdev + * module can more efficiently split these types of I/O. Plus those types + * of I/O do not have a payload, which makes the splitting process simpler. + */ + if (type == SPDK_BDEV_IO_TYPE_READ || type == SPDK_BDEV_IO_TYPE_WRITE) { + return true; + } else { + return false; + } +} + +static bool +_spdk_bdev_io_should_split(struct spdk_bdev_io *bdev_io) +{ + uint64_t start_stripe, end_stripe; + uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; + + if (io_boundary == 0) { + return false; + } + + if (!_spdk_bdev_io_type_can_split(bdev_io->type)) { + return false; + } + + start_stripe = bdev_io->u.bdev.offset_blocks; + end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; + /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
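+ * For example, with an optimal_io_boundary of 8 blocks, an I/O starting at block 6 and spanning 4 blocks yields start_stripe 0 and end_stripe 1, so it will be split.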
*/ + if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { + start_stripe >>= spdk_u32log2(io_boundary); + end_stripe >>= spdk_u32log2(io_boundary); + } else { + start_stripe /= io_boundary; + end_stripe /= io_boundary; + } + return (start_stripe != end_stripe); +} + +static uint32_t +_to_next_boundary(uint64_t offset, uint32_t boundary) +{ + return (boundary - (offset % boundary)); +} + +static void +_spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); + +static void +_spdk_bdev_io_split_with_payload(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t current_offset, remaining; + uint32_t blocklen, to_next_boundary, to_next_boundary_bytes; + struct iovec *parent_iov, *iov; + uint64_t parent_iov_offset, iov_len; + uint32_t parent_iovpos, parent_iovcnt, child_iovcnt, iovcnt; + int rc; + + remaining = bdev_io->u.bdev.split_remaining_num_blocks; + current_offset = bdev_io->u.bdev.split_current_offset_blocks; + blocklen = bdev_io->bdev->blocklen; + parent_iov_offset = (current_offset - bdev_io->u.bdev.offset_blocks) * blocklen; + parent_iovcnt = bdev_io->u.bdev.iovcnt; + + for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + if (parent_iov_offset < parent_iov->iov_len) { + break; + } + parent_iov_offset -= parent_iov->iov_len; + } + + child_iovcnt = 0; + while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + to_next_boundary = _to_next_boundary(current_offset, bdev_io->bdev->optimal_io_boundary); + to_next_boundary = spdk_min(remaining, to_next_boundary); + to_next_boundary_bytes = to_next_boundary * blocklen; + iov = &bdev_io->child_iov[child_iovcnt]; + iovcnt = 0; + while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && + child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { + parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; + iov_len = spdk_min(to_next_boundary_bytes, parent_iov->iov_len - parent_iov_offset); + to_next_boundary_bytes -= iov_len; + + bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; + bdev_io->child_iov[child_iovcnt].iov_len = iov_len; + + if (iov_len < parent_iov->iov_len - parent_iov_offset) { + parent_iov_offset += iov_len; + } else { + parent_iovpos++; + parent_iov_offset = 0; + } + child_iovcnt++; + iovcnt++; + } + + if (to_next_boundary_bytes > 0) { + /* We had to stop this child I/O early because we ran out of + * child_iov space. Make sure the iovs collected are valid and + * then adjust to_next_boundary before starting the child I/O. 
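+ * The uncollected remainder (and therefore the collected iovs) must be a whole number of blocks; if it is not, the child I/O cannot be issued and the parent I/O is failed.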
+ */ + if ((to_next_boundary_bytes % blocklen) != 0) { + SPDK_ERRLOG("Remaining %" PRIu32 " is not multiple of block size %" PRIu32 "\n", + to_next_boundary_bytes, blocklen); + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + return; + } + to_next_boundary -= to_next_boundary_bytes / blocklen; + } + + bdev_io->u.bdev.split_outstanding++; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + rc = spdk_bdev_readv_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, current_offset, to_next_boundary, + _spdk_bdev_io_split_done, bdev_io); + } else { + rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + iov, iovcnt, current_offset, to_next_boundary, + _spdk_bdev_io_split_done, bdev_io); + } + + if (rc == 0) { + current_offset += to_next_boundary; + remaining -= to_next_boundary; + bdev_io->u.bdev.split_current_offset_blocks = current_offset; + bdev_io->u.bdev.split_remaining_num_blocks = remaining; + } else { + bdev_io->u.bdev.split_outstanding--; + if (rc == -ENOMEM) { + if (bdev_io->u.bdev.split_outstanding == 0) { + /* No I/O is outstanding. Hence we should wait here. */ + _spdk_bdev_queue_io_wait_with_cb(bdev_io, + _spdk_bdev_io_split_with_payload); + } + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + if (bdev_io->u.bdev.split_outstanding == 0) { + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } + } + + return; + } + } +} + +static void +_spdk_bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + parent_io->u.bdev.split_outstanding--; + if (parent_io->u.bdev.split_outstanding != 0) { + return; + } + + /* + * Parent I/O finishes when all blocks are consumed or there is any failure of + * child I/O and no outstanding child I/O. + */ + if (parent_io->u.bdev.split_remaining_num_blocks == 0 || + parent_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS) { + parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + parent_io->internal.caller_ctx); + return; + } + + /* + * Continue with the splitting process. This function will complete the parent I/O if the + * splitting is done. 
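+ * Otherwise it issues the next round of child I/O from the parent's saved split_current_offset_blocks and split_remaining_num_blocks.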
+ */ + _spdk_bdev_io_split_with_payload(parent_io); +} + +static void +_spdk_bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + assert(_spdk_bdev_io_type_can_split(bdev_io->type)); + + bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; + bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; + bdev_io->u.bdev.split_outstanding = 0; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + + _spdk_bdev_io_split_with_payload(bdev_io); +} + +static void +_spdk_bdev_io_submit(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_io_channel *ch = bdev_ch->channel; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + uint64_t tsc; + + tsc = spdk_get_ticks(); + bdev_io->internal.submit_tsc = tsc; + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, bdev_io->type); + bdev_ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.in_submit_request = true; + if (spdk_likely(bdev_ch->flags == 0)) { + if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { + bdev->fn_table->submit_request(ch, bdev_io); + } else { + bdev_ch->io_outstanding--; + shared_resource->io_outstanding--; + TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); + } + } else if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { + bdev_ch->io_outstanding--; + shared_resource->io_outstanding--; + TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); + _spdk_bdev_qos_io_submit(bdev_ch, bdev->internal.qos); + } else { + SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + bdev_io->internal.in_submit_request = false; +} + +static void +spdk_bdev_io_submit(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_thread *thread = spdk_io_channel_get_thread(bdev_io->internal.ch->channel); + + assert(thread != NULL); + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + if (bdev->split_on_optimal_io_boundary && _spdk_bdev_io_should_split(bdev_io)) { + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + spdk_bdev_io_get_buf(bdev_io, _spdk_bdev_io_split, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + } else { + _spdk_bdev_io_split(NULL, bdev_io); + } + return; + } + + if (bdev_io->internal.ch->flags & BDEV_CH_QOS_ENABLED) { + if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { + _spdk_bdev_io_submit(bdev_io); + } else { + bdev_io->internal.io_submit_ch = bdev_io->internal.ch; + bdev_io->internal.ch = bdev->internal.qos->ch; + spdk_thread_send_msg(bdev->internal.qos->thread, _spdk_bdev_io_submit, bdev_io); + } + } else { + _spdk_bdev_io_submit(bdev_io); + } +} + +static void +spdk_bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_io_channel *ch = bdev_ch->channel; + + assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); + + bdev_io->internal.in_submit_request = true; + bdev->fn_table->submit_request(ch, bdev_io); + bdev_io->internal.in_submit_request = false; +} + +static void +spdk_bdev_io_init(struct spdk_bdev_io *bdev_io, + struct spdk_bdev *bdev, 
void *cb_arg, + spdk_bdev_io_completion_cb cb) +{ + bdev_io->bdev = bdev; + bdev_io->internal.caller_ctx = cb_arg; + bdev_io->internal.cb = cb; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev_io->internal.in_submit_request = false; + bdev_io->internal.buf = NULL; + bdev_io->internal.io_submit_ch = NULL; +} + +static bool +_spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); +} + +bool +spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) +{ + bool supported; + + supported = _spdk_bdev_io_type_supported(bdev, io_type); + + if (!supported) { + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* The bdev layer will emulate write zeroes as long as write is supported. */ + supported = _spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); + break; + default: + break; + } + } + + return supported; +} + +int +spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + if (bdev->fn_table->dump_info_json) { + return bdev->fn_table->dump_info_json(bdev->ctxt, w); + } + + return 0; +} + +static void +spdk_bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) +{ + uint32_t max_per_timeslice = 0; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + qos->rate_limits[i].max_per_timeslice = 0; + continue; + } + + max_per_timeslice = qos->rate_limits[i].limit * + SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; + + qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, + qos->rate_limits[i].min_per_timeslice); + + qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; + } +} + +static int +spdk_bdev_channel_poll_qos(void *arg) +{ + struct spdk_bdev_qos *qos = arg; + uint64_t now = spdk_get_ticks(); + int i; + + if (now < (qos->last_timeslice + qos->timeslice_size)) { + /* We received our callback earlier than expected - return + * immediately and wait to do accounting until at least one + * timeslice has actually expired. This should never happen + * with a well-behaved timer implementation. + */ + return 0; + } + + /* Reset for next round of rate limiting */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + /* We may have allowed the IOs or bytes to slightly overrun in the last + * timeslice. remaining_this_timeslice is signed, so if it's negative + * here, we'll account for the overrun so that the next timeslice will + * be appropriately reduced. 
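+ * For example, if only 64 KiB of byte budget remained and a 1 MiB I/O was admitted, remaining_this_timeslice is now negative and the deficit is carried forward when max_per_timeslice is added back below.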
+ */ + if (qos->rate_limits[i].remaining_this_timeslice > 0) { + qos->rate_limits[i].remaining_this_timeslice = 0; + } + } + + while (now >= (qos->last_timeslice + qos->timeslice_size)) { + qos->last_timeslice += qos->timeslice_size; + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + qos->rate_limits[i].remaining_this_timeslice += + qos->rate_limits[i].max_per_timeslice; + } + } + + _spdk_bdev_qos_io_submit(qos->ch, qos); + + return -1; +} + +static void +_spdk_bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_shared_resource *shared_resource; + + if (!ch) { + return; + } + + if (ch->channel) { + spdk_put_io_channel(ch->channel); + } + + assert(ch->io_outstanding == 0); + + shared_resource = ch->shared_resource; + if (shared_resource) { + assert(ch->io_outstanding == 0); + assert(shared_resource->ref > 0); + shared_resource->ref--; + if (shared_resource->ref == 0) { + assert(shared_resource->io_outstanding == 0); + TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); + spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); + free(shared_resource); + } + } +} + +/* Caller must hold bdev->internal.mutex. */ +static void +_spdk_bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_qos *qos = bdev->internal.qos; + int i; + + /* Rate limiting on this bdev enabled */ + if (qos) { + if (qos->ch == NULL) { + struct spdk_io_channel *io_ch; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, + bdev->name, spdk_get_thread()); + + /* No qos channel has been selected, so set one up */ + + /* Take another reference to ch */ + io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + qos->ch = ch; + + qos->thread = spdk_io_channel_get_thread(io_ch); + + TAILQ_INIT(&qos->queued); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; + } else { + qos->rate_limits[i].min_per_timeslice = + SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; + } + + if (qos->rate_limits[i].limit == 0) { + qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + spdk_bdev_qos_update_max_quota_per_timeslice(qos); + qos->timeslice_size = + SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + qos->last_timeslice = spdk_get_ticks(); + qos->poller = spdk_poller_register(spdk_bdev_channel_poll_qos, + qos, + SPDK_BDEV_QOS_TIMESLICE_IN_USEC); + } + + ch->flags |= BDEV_CH_QOS_ENABLED; + } +} + +static int +spdk_bdev_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_io_channel *mgmt_io_ch; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource; + + ch->bdev = bdev; + ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); + if (!ch->channel) { + return -1; + } + + mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); + if (!mgmt_io_ch) { + return -1; + } + + mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); + TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { + if (shared_resource->shared_ch == ch->channel) { + spdk_put_io_channel(mgmt_io_ch); + shared_resource->ref++; + break; + } + } + + if (shared_resource == NULL) { + shared_resource = calloc(1, sizeof(*shared_resource)); + if (shared_resource == NULL) { + spdk_put_io_channel(mgmt_io_ch); 
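+ /* shared_resource allocation failed; the extra mgmt channel reference taken above has already been released, so just fail channel creation. */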
+ return -1; + } + + shared_resource->mgmt_ch = mgmt_ch; + shared_resource->io_outstanding = 0; + TAILQ_INIT(&shared_resource->nomem_io); + shared_resource->nomem_threshold = 0; + shared_resource->shared_ch = ch->channel; + shared_resource->ref = 1; + TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); + } + + memset(&ch->stat, 0, sizeof(ch->stat)); + ch->stat.ticks_rate = spdk_get_ticks_hz(); + ch->io_outstanding = 0; + TAILQ_INIT(&ch->queued_resets); + ch->flags = 0; + ch->shared_resource = shared_resource; + +#ifdef SPDK_CONFIG_VTUNE + { + char *name; + __itt_init_ittlib(NULL, 0); + name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); + if (!name) { + _spdk_bdev_channel_destroy_resource(ch); + return -1; + } + ch->handle = __itt_string_handle_create(name); + free(name); + ch->start_tsc = spdk_get_ticks(); + ch->interval_tsc = spdk_get_ticks_hz() / 100; + memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); + } +#endif + + pthread_mutex_lock(&bdev->internal.mutex); + _spdk_bdev_enable_qos(bdev, ch); + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +/* + * Abort I/O that are waiting on a data buffer. These types of I/O are + * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. + */ +static void +_spdk_bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) +{ + bdev_io_stailq_t tmp; + struct spdk_bdev_io *bdev_io; + + STAILQ_INIT(&tmp); + + while (!STAILQ_EMPTY(queue)) { + bdev_io = STAILQ_FIRST(queue); + STAILQ_REMOVE_HEAD(queue, internal.buf_link); + if (bdev_io->internal.ch == ch) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else { + STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); + } + } + + STAILQ_SWAP(&tmp, queue, spdk_bdev_io); +} + +/* + * Abort I/O that are queued waiting for submission. These types of I/O are + * linked using the spdk_bdev_io link TAILQ_ENTRY. + */ +static void +_spdk_bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) +{ + struct spdk_bdev_io *bdev_io, *tmp; + + TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { + if (bdev_io->internal.ch == ch) { + TAILQ_REMOVE(queue, bdev_io, internal.link); + /* + * spdk_bdev_io_complete() assumes that the completed I/O had + * been submitted to the bdev module. Since in this case it + * hadn't, bump io_outstanding to account for the decrement + * that spdk_bdev_io_complete() will do. + */ + if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { + ch->io_outstanding++; + ch->shared_resource->io_outstanding++; + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +spdk_bdev_qos_channel_destroy(void *cb_arg) +{ + struct spdk_bdev_qos *qos = cb_arg; + + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Free QoS %p.\n", qos); + + free(qos); +} + +static int +spdk_bdev_qos_destroy(struct spdk_bdev *bdev) +{ + int i; + + /* + * Cleanly shutting down the QoS poller is tricky, because + * during the asynchronous operation the user could open + * a new descriptor and create a new channel, spawning + * a new QoS poller. + * + * The strategy is to create a new QoS structure here and swap it + * in. The shutdown path then continues to refer to the old one + * until it completes and then releases it. 
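+ * The configured rate limits are carried over into the new structure; only the per-channel state (channel, thread, poller, queue and timeslice counters) is reset.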
+ */ + struct spdk_bdev_qos *new_qos, *old_qos; + + old_qos = bdev->internal.qos; + + new_qos = calloc(1, sizeof(*new_qos)); + if (!new_qos) { + SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); + return -ENOMEM; + } + + /* Copy the old QoS data into the newly allocated structure */ + memcpy(new_qos, old_qos, sizeof(*new_qos)); + + /* Zero out the key parts of the QoS structure */ + new_qos->ch = NULL; + new_qos->thread = NULL; + new_qos->poller = NULL; + TAILQ_INIT(&new_qos->queued); + /* + * The limit member of spdk_bdev_qos_limit structure is not zeroed. + * It will be used later for the new QoS structure. + */ + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + new_qos->rate_limits[i].remaining_this_timeslice = 0; + new_qos->rate_limits[i].min_per_timeslice = 0; + new_qos->rate_limits[i].max_per_timeslice = 0; + } + + bdev->internal.qos = new_qos; + + if (old_qos->thread == NULL) { + free(old_qos); + } else { + spdk_thread_send_msg(old_qos->thread, spdk_bdev_qos_channel_destroy, + old_qos); + } + + /* It is safe to continue with destroying the bdev even though the QoS channel hasn't + * been destroyed yet. The destruction path will end up waiting for the final + * channel to be put before it releases resources. */ + + return 0; +} + +static void +_spdk_bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) +{ + total->bytes_read += add->bytes_read; + total->num_read_ops += add->num_read_ops; + total->bytes_written += add->bytes_written; + total->num_write_ops += add->num_write_ops; + total->read_latency_ticks += add->read_latency_ticks; + total->write_latency_ticks += add->write_latency_ticks; +} + +static void +spdk_bdev_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_channel *ch = ctx_buf; + struct spdk_bdev_mgmt_channel *mgmt_ch; + struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, + spdk_get_thread()); + + /* This channel is going away, so add its statistics into the bdev so that they don't get lost. 
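+ * The counters are folded in under bdev->internal.mutex, since channels on other threads may be doing the same.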
*/ + pthread_mutex_lock(&ch->bdev->internal.mutex); + _spdk_bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); + pthread_mutex_unlock(&ch->bdev->internal.mutex); + + mgmt_ch = shared_resource->mgmt_ch; + + _spdk_bdev_abort_queued_io(&ch->queued_resets, ch); + _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, ch); + _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_small, ch); + _spdk_bdev_abort_buf_io(&mgmt_ch->need_buf_large, ch); + + _spdk_bdev_channel_destroy_resource(ch); +} + +int +spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + if (alias == NULL) { + SPDK_ERRLOG("Empty alias passed\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(alias)) { + SPDK_ERRLOG("Bdev name/alias: %s already exists\n", alias); + return -EEXIST; + } + + tmp = calloc(1, sizeof(*tmp)); + if (tmp == NULL) { + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + tmp->alias = strdup(alias); + if (tmp->alias == NULL) { + free(tmp); + SPDK_ERRLOG("Unable to allocate alias\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); + + return 0; +} + +int +spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) +{ + struct spdk_bdev_alias *tmp; + + TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { + if (strcmp(alias, tmp->alias) == 0) { + TAILQ_REMOVE(&bdev->aliases, tmp, tailq); + free(tmp->alias); + free(tmp); + return 0; + } + } + + SPDK_INFOLOG(SPDK_LOG_BDEV, "Alias %s does not exists\n", alias); + + return -ENOENT; +} + +void +spdk_bdev_alias_del_all(struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *p, *tmp; + + TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { + TAILQ_REMOVE(&bdev->aliases, p, tailq); + free(p->alias); + free(p); + } +} + +struct spdk_io_channel * +spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) +{ + return spdk_get_io_channel(__bdev_to_io_dev(desc->bdev)); +} + +const char * +spdk_bdev_get_name(const struct spdk_bdev *bdev) +{ + return bdev->name; +} + +const char * +spdk_bdev_get_product_name(const struct spdk_bdev *bdev) +{ + return bdev->product_name; +} + +const struct spdk_bdev_aliases_list * +spdk_bdev_get_aliases(const struct spdk_bdev *bdev) +{ + return &bdev->aliases; +} + +uint32_t +spdk_bdev_get_block_size(const struct spdk_bdev *bdev) +{ + return bdev->blocklen; +} + +uint64_t +spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) +{ + return bdev->blockcnt; +} + +const char * +spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) +{ + return qos_rpc_type[type]; +} + +void +spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + limits[i] = bdev->internal.qos->rate_limits[i].limit; + if (_spdk_bdev_qos_is_iops_rate_limit(i) == false) { + /* Change from Byte to Megabyte which is user visible. 
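+ * (i.e. divide by 1024 * 1024 so the reported value matches the rw_mbytes_per_sec RPC parameter)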
*/ + limits[i] = limits[i] / 1024 / 1024; + } + } + } + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +size_t +spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) +{ + /* TODO: push this logic down to the bdev modules */ + if (bdev->need_aligned_buffer) { + return bdev->blocklen; + } + + return 1; +} + +uint32_t +spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) +{ + return bdev->optimal_io_boundary; +} + +bool +spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) +{ + return bdev->write_cache; +} + +const struct spdk_uuid * +spdk_bdev_get_uuid(const struct spdk_bdev *bdev) +{ + return &bdev->uuid; +} + +uint64_t +spdk_bdev_get_qd(const struct spdk_bdev *bdev) +{ + return bdev->internal.measured_queue_depth; +} + +uint64_t +spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) +{ + return bdev->internal.period; +} + +uint64_t +spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) +{ + return bdev->internal.weighted_io_time; +} + +uint64_t +spdk_bdev_get_io_time(const struct spdk_bdev *bdev) +{ + return bdev->internal.io_time; +} + +static void +_calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + + bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; + + if (bdev->internal.measured_queue_depth) { + bdev->internal.io_time += bdev->internal.period; + bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; + } +} + +static void +_calculate_measured_qd(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); + + bdev->internal.temporary_queue_depth += ch->io_outstanding; + spdk_for_each_channel_continue(i, 0); +} + +static int +spdk_bdev_calculate_measured_queue_depth(void *ctx) +{ + struct spdk_bdev *bdev = ctx; + bdev->internal.temporary_queue_depth = 0; + spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, + _calculate_measured_qd_cpl); + return 0; +} + +void +spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) +{ + bdev->internal.period = period; + + if (bdev->internal.qd_poller != NULL) { + spdk_poller_unregister(&bdev->internal.qd_poller); + bdev->internal.measured_queue_depth = UINT64_MAX; + } + + if (period != 0) { + bdev->internal.qd_poller = spdk_poller_register(spdk_bdev_calculate_measured_queue_depth, bdev, + period); + } +} + +int +spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) +{ + int ret; + + pthread_mutex_lock(&bdev->internal.mutex); + + /* bdev has open descriptors */ + if (!TAILQ_EMPTY(&bdev->internal.open_descs) && + bdev->blockcnt > size) { + ret = -EBUSY; + } else { + bdev->blockcnt = size; + ret = 0; + } + + pthread_mutex_unlock(&bdev->internal.mutex); + + return ret; +} + +/* + * Convert I/O offset and length from bytes to blocks. + * + * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 
+ */ +static uint64_t +spdk_bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, + uint64_t num_bytes, uint64_t *num_blocks) +{ + uint32_t block_size = bdev->blocklen; + + *offset_blocks = offset_bytes / block_size; + *num_blocks = num_bytes / block_size; + + return (offset_bytes % block_size) | (num_bytes % block_size); +} + +static bool +spdk_bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) +{ + /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there + * has been an overflow and hence the offset has been wrapped around */ + if (offset_blocks + num_blocks < offset_blocks) { + return false; + } + + /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ + if (offset_blocks + num_blocks > bdev->blockcnt) { + return false; + } + + return true; +} + +int +spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +int spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_READ; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = 
offset_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + void *buf, uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = buf; + bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; + bdev_io->u.bdev.iovcnt = 1; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; + bdev_io->u.bdev.iovs = iov; + bdev_io->u.bdev.iovcnt = iovcnt; + bdev_io->u.bdev.num_blocks = num_blocks; + bdev_io->u.bdev.offset_blocks = offset_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, len, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + 
spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { + spdk_bdev_io_submit(bdev_io); + return 0; + } else if (_spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { + assert(spdk_bdev_get_block_size(bdev) <= ZERO_BUFFER_SIZE); + bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; + bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; + _spdk_bdev_write_zero_buffer_next(bdev_io); + return 0; + } else { + spdk_bdev_free_io(bdev_io); + return -ENOTSUP; + } +} + +int +spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, nbytes, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Can't unmap 0 bytes\n"); + return -EINVAL; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; + + bdev_io->u.bdev.iovs = &bdev_io->iov; + bdev_io->u.bdev.iovs[0].iov_base = NULL; + bdev_io->u.bdev.iovs[0].iov_len = 0; + bdev_io->u.bdev.iovcnt = 1; + + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset, uint64_t length, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + uint64_t offset_blocks, num_blocks; + + if (spdk_bdev_bytes_to_blocks(desc->bdev, offset, &offset_blocks, length, &num_blocks) != 0) { + return -EINVAL; + } + + return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); +} + +int +spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + uint64_t offset_blocks, uint64_t num_blocks, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + if (!spdk_bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { + return -EINVAL; + } + + 
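/* Grab a bdev_io from the per-thread cache; a NULL return means no bdev_io objects are + * currently available, so report -ENOMEM and let the caller retry later, + * for example via spdk_bdev_queue_io_wait(). */ +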
bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; + bdev_io->u.bdev.iovs = NULL; + bdev_io->u.bdev.iovcnt = 0; + bdev_io->u.bdev.offset_blocks = offset_blocks; + bdev_io->u.bdev.num_blocks = num_blocks; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +static void +_spdk_bdev_reset_dev(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); + struct spdk_bdev_io *bdev_io; + + bdev_io = TAILQ_FIRST(&ch->queued_resets); + TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); + spdk_bdev_io_submit_reset(bdev_io); +} + +static void +_spdk_bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch; + struct spdk_bdev_channel *channel; + struct spdk_bdev_mgmt_channel *mgmt_channel; + struct spdk_bdev_shared_resource *shared_resource; + bdev_io_tailq_t tmp_queued; + + TAILQ_INIT(&tmp_queued); + + ch = spdk_io_channel_iter_get_channel(i); + channel = spdk_io_channel_get_ctx(ch); + shared_resource = channel->shared_resource; + mgmt_channel = shared_resource->mgmt_ch; + + channel->flags |= BDEV_CH_RESET_IN_PROGRESS; + + if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { + /* The QoS object is always valid and readable while + * the channel flag is set, so the lock here should not + * be necessary. We're not in the fast path though, so + * just take it anyway. */ + pthread_mutex_lock(&channel->bdev->internal.mutex); + if (channel->bdev->internal.qos->ch == channel) { + TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); + } + pthread_mutex_unlock(&channel->bdev->internal.mutex); + } + + _spdk_bdev_abort_queued_io(&shared_resource->nomem_io, channel); + _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_small, channel); + _spdk_bdev_abort_buf_io(&mgmt_channel->need_buf_large, channel); + _spdk_bdev_abort_queued_io(&tmp_queued, channel); + + spdk_for_each_channel_continue(i, 0); +} + +static void +_spdk_bdev_start_reset(void *ctx) +{ + struct spdk_bdev_channel *ch = ctx; + + spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), _spdk_bdev_reset_freeze_channel, + ch, _spdk_bdev_reset_dev); +} + +static void +_spdk_bdev_channel_start_reset(struct spdk_bdev_channel *ch) +{ + struct spdk_bdev *bdev = ch->bdev; + + assert(!TAILQ_EMPTY(&ch->queued_resets)); + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.reset_in_progress == NULL) { + bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); + /* + * Take a channel reference for the target bdev for the life of this + * reset. This guards against the channel getting destroyed while + * spdk_for_each_channel() calls related to this reset IO are in + * progress. We will release the reference when this reset is + * completed. 
+ */ + bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); + _spdk_bdev_start_reset(ch); + } + pthread_mutex_unlock(&bdev->internal.mutex); +} + +int +spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; + bdev_io->u.reset.ch_ref = NULL; + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + pthread_mutex_lock(&bdev->internal.mutex); + TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); + pthread_mutex_unlock(&bdev->internal.mutex); + + _spdk_bdev_channel_start_reset(channel); + + return 0; +} + +void +spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_stat *stat) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + *stat = channel->stat; +} + +static void +_spdk_bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + + bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, + bdev_iostat_ctx->cb_arg, 0); + free(bdev_iostat_ctx); +} + +static void +_spdk_bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, + spdk_bdev_get_device_stat_cb cb, void *cb_arg) +{ + struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; + + assert(bdev != NULL); + assert(stat != NULL); + assert(cb != NULL); + + bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); + if (bdev_iostat_ctx == NULL) { + SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); + cb(bdev, stat, cb_arg, -ENOMEM); + return; + } + + bdev_iostat_ctx->stat = stat; + bdev_iostat_ctx->cb = cb; + bdev_iostat_ctx->cb_arg = cb_arg; + + /* Start with the statistics from previously deleted channels. */ + pthread_mutex_lock(&bdev->internal.mutex); + _spdk_bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); + pthread_mutex_unlock(&bdev->internal.mutex); + + /* Then iterate and add the statistics from each existing channel. 
*/ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + _spdk_bdev_get_each_channel_stat, + bdev_iostat_ctx, + _spdk_bdev_get_device_stat_done); +} + +int +spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + return -EBADF; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. + */ + return -EBADF; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = NULL; + bdev_io->u.nvme_passthru.md_len = 0; + + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, + const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, + spdk_bdev_io_completion_cb cb, void *cb_arg) +{ + struct spdk_bdev *bdev = desc->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + + if (!desc->write) { + /* + * Do not try to parse the NVMe command - we could maybe use bits in the opcode + * to easily determine if the command is a read or write, but for now just + * do not allow io_passthru with a read-only descriptor. 
+ */ + return -EBADF; + } + + bdev_io = spdk_bdev_get_io(channel); + if (!bdev_io) { + return -ENOMEM; + } + + bdev_io->internal.ch = channel; + bdev_io->internal.desc = desc; + bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; + bdev_io->u.nvme_passthru.cmd = *cmd; + bdev_io->u.nvme_passthru.buf = buf; + bdev_io->u.nvme_passthru.nbytes = nbytes; + bdev_io->u.nvme_passthru.md_buf = md_buf; + bdev_io->u.nvme_passthru.md_len = md_len; + + spdk_bdev_io_init(bdev_io, bdev, cb_arg, cb); + + spdk_bdev_io_submit(bdev_io); + return 0; +} + +int +spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, + struct spdk_bdev_io_wait_entry *entry) +{ + struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); + struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; + + if (bdev != entry->bdev) { + SPDK_ERRLOG("bdevs do not match\n"); + return -EINVAL; + } + + if (mgmt_ch->per_thread_cache_count > 0) { + SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); + return -EINVAL; + } + + TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); + return 0; +} + +static void +_spdk_bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) +{ + struct spdk_bdev *bdev = bdev_ch->bdev; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + struct spdk_bdev_io *bdev_io; + + if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { + /* + * Allow some more I/O to complete before retrying the nomem_io queue. + * Some drivers (such as nvme) cannot immediately take a new I/O in + * the context of a completion, because the resources for the I/O are + * not released until control returns to the bdev poller. Also, we + * may require several small I/O to complete before a larger I/O + * (that requires splitting) can be submitted. + */ + return; + } + + while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { + bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); + TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); + bdev_io->internal.ch->io_outstanding++; + shared_resource->io_outstanding++; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; + bdev->fn_table->submit_request(bdev_io->internal.ch->channel, bdev_io); + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { + break; + } + } +} + +static inline void +_spdk_bdev_io_complete(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + uint64_t tsc; + + if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { + /* + * Send the completion to the thread that originally submitted the I/O, + * which may not be the current thread in the case of QoS. + */ + if (bdev_io->internal.io_submit_ch) { + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + /* + * Defer completion to avoid potential infinite recursion if the + * user's completion callback issues a new I/O. 
+ */ + spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), + _spdk_bdev_io_complete, bdev_io); + return; + } + + tsc = spdk_get_ticks(); + spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 0); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_read_ops++; + bdev_io->internal.ch->stat.read_latency_ticks += (tsc - bdev_io->internal.submit_tsc); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + bdev_io->internal.ch->stat.num_write_ops++; + bdev_io->internal.ch->stat.write_latency_ticks += (tsc - bdev_io->internal.submit_tsc); + break; + default: + break; + } + } + +#ifdef SPDK_CONFIG_VTUNE + uint64_t now_tsc = spdk_get_ticks(); + if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { + uint64_t data[5]; + + data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; + data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; + data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; + data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; + data[4] = bdev_io->bdev->fn_table->get_spin_time ? + bdev_io->bdev->fn_table->get_spin_time(bdev_io->internal.ch->channel) : 0; + + __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, + __itt_metadata_u64, 5, data); + + bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; + bdev_io->internal.ch->start_tsc = now_tsc; + } +#endif + + assert(bdev_io->internal.cb != NULL); + assert(spdk_get_thread() == spdk_io_channel_get_thread(bdev_io->internal.ch->channel)); + + bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, + bdev_io->internal.caller_ctx); +} + +static void +_spdk_bdev_reset_complete(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); + + if (bdev_io->u.reset.ch_ref != NULL) { + spdk_put_io_channel(bdev_io->u.reset.ch_ref); + bdev_io->u.reset.ch_ref = NULL; + } + + _spdk_bdev_io_complete(bdev_io); +} + +static void +_spdk_bdev_unfreeze_channel(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); + + ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; + if (!TAILQ_EMPTY(&ch->queued_resets)) { + _spdk_bdev_channel_start_reset(ch); + } + + spdk_for_each_channel_continue(i, 0); +} + +void +spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) +{ + struct spdk_bdev *bdev = bdev_io->bdev; + struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; + struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; + + bdev_io->internal.status = status; + + if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { + bool unlock_channels = false; + + if (status == SPDK_BDEV_IO_STATUS_NOMEM) { + SPDK_ERRLOG("NOMEM returned for reset\n"); + } + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev_io == bdev->internal.reset_in_progress) { + bdev->internal.reset_in_progress = NULL; + unlock_channels = true; + } + 
pthread_mutex_unlock(&bdev->internal.mutex); + + if (unlock_channels) { + spdk_for_each_channel(__bdev_to_io_dev(bdev), _spdk_bdev_unfreeze_channel, + bdev_io, _spdk_bdev_reset_complete); + return; + } + } else { + assert(bdev_ch->io_outstanding > 0); + assert(shared_resource->io_outstanding > 0); + bdev_ch->io_outstanding--; + shared_resource->io_outstanding--; + + if (spdk_unlikely(status == SPDK_BDEV_IO_STATUS_NOMEM)) { + TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); + /* + * Wait for some of the outstanding I/O to complete before we + * retry any of the nomem_io. Normally we will wait for + * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue + * depth channels we will instead wait for half to complete. + */ + shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, + (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); + return; + } + + if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { + _spdk_bdev_ch_retry_io(bdev_ch); + } + } + + _spdk_bdev_io_complete(bdev_io); +} + +void +spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, + enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) +{ + if (sc == SPDK_SCSI_STATUS_GOOD) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; + bdev_io->internal.error.scsi.sc = sc; + bdev_io->internal.error.scsi.sk = sk; + bdev_io->internal.error.scsi.asc = asc; + bdev_io->internal.error.scsi.ascq = ascq; + } + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, + int *sc, int *sk, int *asc, int *ascq) +{ + assert(sc != NULL); + assert(sk != NULL); + assert(asc != NULL); + assert(ascq != NULL); + + switch (bdev_io->internal.status) { + case SPDK_BDEV_IO_STATUS_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_BDEV_IO_STATUS_NVME_ERROR: + spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); + break; + case SPDK_BDEV_IO_STATUS_SCSI_ERROR: + *sc = bdev_io->internal.error.scsi.sc; + *sk = bdev_io->internal.error.scsi.sk; + *asc = bdev_io->internal.error.scsi.asc; + *ascq = bdev_io->internal.error.scsi.ascq; + break; + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} + +void +spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, int sct, int sc) +{ + if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + } else { + bdev_io->internal.error.nvme.sct = sct; + bdev_io->internal.error.nvme.sc = sc; + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; + } + + spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); +} + +void +spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, int *sct, int *sc) +{ + assert(sct != NULL); + assert(sc != NULL); + + if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { + *sct = bdev_io->internal.error.nvme.sct; + *sc = bdev_io->internal.error.nvme.sc; + } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = SPDK_NVME_SC_SUCCESS; + } else { + *sct = SPDK_NVME_SCT_GENERIC; + *sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + } +} + +struct spdk_thread * +spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) +{ + return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); +} + +static void +_spdk_bdev_qos_config_limit(struct spdk_bdev *bdev, uint64_t *limits) +{ + uint64_t min_qos_set; + int i; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + break; + } + } + + if (i == SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + SPDK_ERRLOG("Invalid rate limits set.\n"); + return; + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { + min_qos_set = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + min_qos_set = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + if (limits[i] == 0 || limits[i] % min_qos_set) { + SPDK_ERRLOG("Assigned limit %" PRIu64 " on bdev %s is not multiple of %" PRIu64 "\n", + limits[i], bdev->name, min_qos_set); + SPDK_ERRLOG("Failed to enable QoS on this bdev %s\n", bdev->name); + return; + } + } + + if (!bdev->internal.qos) { + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + return; + } + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Bdev:%s QoS type:%d set:%lu\n", + bdev->name, i, limits[i]); + } + + return; +} + +static void +_spdk_bdev_qos_config(struct spdk_bdev *bdev) +{ + struct spdk_conf_section *sp = NULL; + const char *val = NULL; + int i = 0, j = 0; + uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {}; + bool config_qos = false; + + sp = spdk_conf_find_section(NULL, "QoS"); + if (!sp) { + return; + } + + while (j < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES) { + limits[j] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + + i = 0; + while (true) { + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 0); + if (!val) { + break; + } + + if (strcmp(bdev->name, val) != 0) { + i++; + continue; + } + + val = spdk_conf_section_get_nmval(sp, qos_conf_type[j], i, 1); + if (val) { + if (_spdk_bdev_qos_is_iops_rate_limit(j) == true) { + limits[j] = strtoull(val, NULL, 10); + } else { + limits[j] = strtoull(val, NULL, 10) * 1024 * 1024; + } + config_qos = true; + } + + break; + } + + j++; + } + + if (config_qos == true) { + _spdk_bdev_qos_config_limit(bdev, limits); + } + + return; +} + +static int +spdk_bdev_init(struct spdk_bdev *bdev) +{ + char *bdev_name; + + assert(bdev->module != NULL); + + if (!bdev->name) { + SPDK_ERRLOG("Bdev name is NULL\n"); + return -EINVAL; + } + + if (spdk_bdev_get_by_name(bdev->name)) { + SPDK_ERRLOG("Bdev name:%s already exists\n", bdev->name); + return -EEXIST; + } + + /* Users often register their own I/O devices using the bdev name. In + * order to avoid conflicts, prepend bdev_. 
*/ + bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); + if (!bdev_name) { + SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); + return -ENOMEM; + } + + bdev->internal.status = SPDK_BDEV_STATUS_READY; + bdev->internal.measured_queue_depth = UINT64_MAX; + bdev->internal.claim_module = NULL; + bdev->internal.qd_poller = NULL; + bdev->internal.qos = NULL; + + TAILQ_INIT(&bdev->internal.open_descs); + + TAILQ_INIT(&bdev->aliases); + + bdev->internal.reset_in_progress = NULL; + + _spdk_bdev_qos_config(bdev); + + spdk_io_device_register(__bdev_to_io_dev(bdev), + spdk_bdev_channel_create, spdk_bdev_channel_destroy, + sizeof(struct spdk_bdev_channel), + bdev_name); + + free(bdev_name); + + pthread_mutex_init(&bdev->internal.mutex, NULL); + return 0; +} + +static void +spdk_bdev_destroy_cb(void *io_device) +{ + int rc; + struct spdk_bdev *bdev; + spdk_bdev_unregister_cb cb_fn; + void *cb_arg; + + bdev = __bdev_from_io_dev(io_device); + cb_fn = bdev->internal.unregister_cb; + cb_arg = bdev->internal.unregister_ctx; + + rc = bdev->fn_table->destruct(bdev->ctxt); + if (rc < 0) { + SPDK_ERRLOG("destruct failed\n"); + } + if (rc <= 0 && cb_fn != NULL) { + cb_fn(cb_arg, rc); + } +} + + +static void +spdk_bdev_fini(struct spdk_bdev *bdev) +{ + pthread_mutex_destroy(&bdev->internal.mutex); + + free(bdev->internal.qos); + + spdk_io_device_unregister(__bdev_to_io_dev(bdev), spdk_bdev_destroy_cb); +} + +static void +spdk_bdev_start(struct spdk_bdev *bdev) +{ + struct spdk_bdev_module *module; + uint32_t action; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Inserting bdev %s into list\n", bdev->name); + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); + + /* Examine configuration before initializing I/O */ + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_config) { + action = module->internal.action_in_progress; + module->internal.action_in_progress++; + module->examine_config(bdev); + if (action != module->internal.action_in_progress) { + SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", + module->name); + } + } + } + + if (bdev->internal.claim_module) { + return; + } + + TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (module->examine_disk) { + module->internal.action_in_progress++; + module->examine_disk(bdev); + } + } +} + +int +spdk_bdev_register(struct spdk_bdev *bdev) +{ + int rc = spdk_bdev_init(bdev); + + if (rc == 0) { + spdk_bdev_start(bdev); + } + + return rc; +} + +int +spdk_vbdev_register(struct spdk_bdev *vbdev, struct spdk_bdev **base_bdevs, int base_bdev_count) +{ + int rc; + + rc = spdk_bdev_init(vbdev); + if (rc) { + return rc; + } + + spdk_bdev_start(vbdev); + return 0; +} + +void +spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) +{ + if (bdev->internal.unregister_cb != NULL) { + bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); + } +} + +static void +_remove_notify(void *arg) +{ + struct spdk_bdev_desc *desc = arg; + + desc->remove_scheduled = false; + + if (desc->closed) { + free(desc); + } else { + desc->remove_cb(desc->remove_ctx); + } +} + +void +spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev_desc *desc, *tmp; + bool do_destruct = true; + struct spdk_thread *thread; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Removing bdev %s from list\n", bdev->name); + + thread = spdk_get_thread(); + if (!thread) { + /* The user called this from a non-SPDK thread. 
*/ + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENOTSUP); + } + return; + } + + pthread_mutex_lock(&bdev->internal.mutex); + + bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; + bdev->internal.unregister_cb = cb_fn; + bdev->internal.unregister_ctx = cb_arg; + + TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { + if (desc->remove_cb) { + do_destruct = false; + /* + * Defer invocation of the remove_cb to a separate message that will + * run later on its thread. This ensures this context unwinds and + * we don't recursively unregister this bdev again if the remove_cb + * immediately closes its descriptor. + */ + if (!desc->remove_scheduled) { + /* Avoid scheduling removal of the same descriptor multiple times. */ + desc->remove_scheduled = true; + spdk_thread_send_msg(desc->thread, _remove_notify, desc); + } + } + } + + if (!do_destruct) { + pthread_mutex_unlock(&bdev->internal.mutex); + return; + } + + TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); + pthread_mutex_unlock(&bdev->internal.mutex); + + spdk_bdev_fini(bdev); +} + +int +spdk_bdev_open(struct spdk_bdev *bdev, bool write, spdk_bdev_remove_cb_t remove_cb, + void *remove_ctx, struct spdk_bdev_desc **_desc) +{ + struct spdk_bdev_desc *desc; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + if (!thread) { + SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); + return -ENOTSUP; + } + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + pthread_mutex_lock(&bdev->internal.mutex); + + if (write && bdev->internal.claim_module) { + SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", + bdev->name, bdev->internal.claim_module->name); + free(desc); + pthread_mutex_unlock(&bdev->internal.mutex); + return -EPERM; + } + + TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); + + desc->bdev = bdev; + desc->thread = thread; + desc->remove_cb = remove_cb; + desc->remove_ctx = remove_ctx; + desc->write = write; + *_desc = desc; + + pthread_mutex_unlock(&bdev->internal.mutex); + + return 0; +} + +void +spdk_bdev_close(struct spdk_bdev_desc *desc) +{ + struct spdk_bdev *bdev = desc->bdev; + bool do_unregister = false; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, + spdk_get_thread()); + + assert(desc->thread == spdk_get_thread()); + + pthread_mutex_lock(&bdev->internal.mutex); + + TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); + + desc->closed = true; + + if (!desc->remove_scheduled) { + free(desc); + } + + /* If no more descriptors, kill QoS channel */ + if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", + bdev->name, spdk_get_thread()); + + if (spdk_bdev_qos_destroy(bdev)) { + /* There isn't anything we can do to recover here. Just let the + * old QoS poller keep running. The QoS handling won't change + * cores when the user allocates a new channel, but it won't break. */ + SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); + } + } + + spdk_bdev_set_qd_sampling_period(bdev, 0); + + if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { + do_unregister = true; + } + pthread_mutex_unlock(&bdev->internal.mutex); + + if (do_unregister == true) { + spdk_bdev_unregister(bdev, bdev->internal.unregister_cb, bdev->internal.unregister_ctx); + } +} + +int +spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_bdev_module *module) +{ + if (bdev->internal.claim_module != NULL) { + SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, + bdev->internal.claim_module->name); + return -EPERM; + } + + if (desc && !desc->write) { + desc->write = true; + } + + bdev->internal.claim_module = module; + return 0; +} + +void +spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) +{ + assert(bdev->internal.claim_module != NULL); + bdev->internal.claim_module = NULL; +} + +struct spdk_bdev * +spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) +{ + return desc->bdev; +} + +void +spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) +{ + struct iovec *iovs; + int iovcnt; + + if (bdev_io == NULL) { + return; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + iovs = bdev_io->u.bdev.iovs; + iovcnt = bdev_io->u.bdev.iovcnt; + break; + case SPDK_BDEV_IO_TYPE_WRITE: + iovs = bdev_io->u.bdev.iovs; + iovcnt = bdev_io->u.bdev.iovcnt; + break; + default: + iovs = NULL; + iovcnt = 0; + break; + } + + if (iovp) { + *iovp = iovs; + } + if (iovcntp) { + *iovcntp = iovcnt; + } +} + +void +spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) +{ + + if (spdk_bdev_module_list_find(bdev_module->name)) { + SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); + assert(false); + } + + if (bdev_module->async_init) { + bdev_module->internal.action_in_progress = 1; + } + + /* + * Modules with examine callbacks must be initialized first, so they are + * ready to handle examine callbacks from later modules that will + * register physical bdevs. 
+ */ + if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { + TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } else { + TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); + } +} + +struct spdk_bdev_module * +spdk_bdev_module_list_find(const char *name) +{ + struct spdk_bdev_module *bdev_module; + + TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { + if (strcmp(name, bdev_module->name) == 0) { + break; + } + } + + return bdev_module; +} + +static void +_spdk_bdev_write_zero_buffer_next(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + uint64_t num_bytes, num_blocks; + int rc; + + num_bytes = spdk_min(spdk_bdev_get_block_size(bdev_io->bdev) * + bdev_io->u.bdev.split_remaining_num_blocks, + ZERO_BUFFER_SIZE); + num_blocks = num_bytes / spdk_bdev_get_block_size(bdev_io->bdev); + + rc = spdk_bdev_write_blocks(bdev_io->internal.desc, + spdk_io_channel_from_ctx(bdev_io->internal.ch), + g_bdev_mgr.zero_buffer, + bdev_io->u.bdev.split_current_offset_blocks, num_blocks, + _spdk_bdev_write_zero_buffer_done, bdev_io); + if (rc == 0) { + bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; + bdev_io->u.bdev.split_current_offset_blocks += num_blocks; + } else if (rc == -ENOMEM) { + _spdk_bdev_queue_io_wait_with_cb(bdev_io, _spdk_bdev_write_zero_buffer_next); + } else { + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); + } +} + +static void +_spdk_bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); + return; + } + + if (parent_io->u.bdev.split_remaining_num_blocks == 0) { + parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; + parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); + return; + } + + _spdk_bdev_write_zero_buffer_next(parent_io); +} + +struct set_qos_limit_ctx { + void (*cb_fn)(void *cb_arg, int status); + void *cb_arg; + struct spdk_bdev *bdev; +}; + +static void +_spdk_bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) +{ + pthread_mutex_lock(&ctx->bdev->internal.mutex); + ctx->bdev->internal.qos_mod_in_progress = false; + pthread_mutex_unlock(&ctx->bdev->internal.mutex); + + ctx->cb_fn(ctx->cb_arg, status); + free(ctx); +} + +static void +_spdk_bdev_disable_qos_done(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + struct spdk_bdev_io *bdev_io; + struct spdk_bdev_qos *qos; + + pthread_mutex_lock(&bdev->internal.mutex); + qos = bdev->internal.qos; + bdev->internal.qos = NULL; + pthread_mutex_unlock(&bdev->internal.mutex); + + while (!TAILQ_EMPTY(&qos->queued)) { + /* Send queued I/O back to their original thread for resubmission. */ + bdev_io = TAILQ_FIRST(&qos->queued); + TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); + + if (bdev_io->internal.io_submit_ch) { + /* + * Channel was changed when sending it to the QoS thread - change it back + * before sending it back to the original thread. 
+ */ + bdev_io->internal.ch = bdev_io->internal.io_submit_ch; + bdev_io->internal.io_submit_ch = NULL; + } + + spdk_thread_send_msg(spdk_io_channel_get_thread(bdev_io->internal.ch->channel), + _spdk_bdev_io_submit, bdev_io); + } + + spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); + spdk_poller_unregister(&qos->poller); + + free(qos); + + _spdk_bdev_set_qos_limit_done(ctx, 0); +} + +static void +_spdk_bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_thread *thread; + + pthread_mutex_lock(&bdev->internal.mutex); + thread = bdev->internal.qos->thread; + pthread_mutex_unlock(&bdev->internal.mutex); + + spdk_thread_send_msg(thread, _spdk_bdev_disable_qos_done, ctx); +} + +static void +_spdk_bdev_disable_qos_msg(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; + + spdk_for_each_channel_continue(i, 0); +} + +static void +_spdk_bdev_update_qos_rate_limit_msg(void *cb_arg) +{ + struct set_qos_limit_ctx *ctx = cb_arg; + struct spdk_bdev *bdev = ctx->bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + spdk_bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); + pthread_mutex_unlock(&bdev->internal.mutex); + + _spdk_bdev_set_qos_limit_done(ctx, 0); +} + +static void +_spdk_bdev_enable_qos_msg(struct spdk_io_channel_iter *i) +{ + void *io_device = spdk_io_channel_iter_get_io_device(i); + struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); + + pthread_mutex_lock(&bdev->internal.mutex); + _spdk_bdev_enable_qos(bdev, bdev_ch); + pthread_mutex_unlock(&bdev->internal.mutex); + spdk_for_each_channel_continue(i, 0); +} + +static void +_spdk_bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) +{ + struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + _spdk_bdev_set_qos_limit_done(ctx, status); +} + +static void +_spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) +{ + int i; + + assert(bdev->internal.qos != NULL); + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + bdev->internal.qos->rate_limits[i].limit = limits[i]; + + if (limits[i] == 0) { + bdev->internal.qos->rate_limits[i].limit = + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; + } + } + } +} + +void +spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, + void (*cb_fn)(void *cb_arg, int status), void *cb_arg) +{ + struct set_qos_limit_ctx *ctx; + uint32_t limit_set_complement; + uint64_t min_limit_per_sec; + int i; + bool disable_rate_limit = true; + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { + continue; + } + + if (limits[i] > 0) { + disable_rate_limit = false; + } + + if (_spdk_bdev_qos_is_iops_rate_limit(i) == true) { + min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; + } else { + /* Change from megabyte to byte rate limit */ + limits[i] = limits[i] * 1024 * 1024; + min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; + } + + limit_set_complement = limits[i] % min_limit_per_sec; + if (limit_set_complement) { + 
SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", + limits[i], min_limit_per_sec); + limits[i] += min_limit_per_sec - limit_set_complement; + SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); + } + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->bdev = bdev; + + pthread_mutex_lock(&bdev->internal.mutex); + if (bdev->internal.qos_mod_in_progress) { + pthread_mutex_unlock(&bdev->internal.mutex); + free(ctx); + cb_fn(cb_arg, -EAGAIN); + return; + } + bdev->internal.qos_mod_in_progress = true; + + if (disable_rate_limit == true && bdev->internal.qos) { + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && + (bdev->internal.qos->rate_limits[i].limit > 0 && + bdev->internal.qos->rate_limits[i].limit != + SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { + disable_rate_limit = false; + break; + } + } + } + + if (disable_rate_limit == false) { + if (bdev->internal.qos == NULL) { + /* Enabling */ + bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); + if (!bdev->internal.qos) { + pthread_mutex_unlock(&bdev->internal.mutex); + SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + _spdk_bdev_set_qos_rate_limits(bdev, limits); + + spdk_for_each_channel(__bdev_to_io_dev(bdev), + _spdk_bdev_enable_qos_msg, ctx, + _spdk_bdev_enable_qos_done); + } else { + /* Updating */ + _spdk_bdev_set_qos_rate_limits(bdev, limits); + + spdk_thread_send_msg(bdev->internal.qos->thread, + _spdk_bdev_update_qos_rate_limit_msg, ctx); + } + } else { + if (bdev->internal.qos != NULL) { + _spdk_bdev_set_qos_rate_limits(bdev, limits); + + /* Disabling */ + spdk_for_each_channel(__bdev_to_io_dev(bdev), + _spdk_bdev_disable_qos_msg, ctx, + _spdk_bdev_disable_qos_msg_done); + } else { + pthread_mutex_unlock(&bdev->internal.mutex); + _spdk_bdev_set_qos_limit_done(ctx, 0); + return; + } + } + + pthread_mutex_unlock(&bdev->internal.mutex); +} + +SPDK_LOG_REGISTER_COMPONENT("bdev", SPDK_LOG_BDEV) + +SPDK_TRACE_REGISTER_FN(bdev_trace) +{ + spdk_trace_register_owner(OWNER_BDEV, 'b'); + spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); + spdk_trace_register_description("BDEV_IO_START", "", TRACE_BDEV_IO_START, OWNER_BDEV, + OBJECT_BDEV_IO, 1, 0, "type: "); + spdk_trace_register_description("BDEV_IO_DONE", "", TRACE_BDEV_IO_DONE, OWNER_BDEV, + OBJECT_BDEV_IO, 0, 0, ""); +} diff --git a/src/spdk/lib/bdev/crypto/Makefile b/src/spdk/lib/bdev/crypto/Makefile new file mode 100644 index 00000000..c3eb1b74 --- /dev/null +++ b/src/spdk/lib/bdev/crypto/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vbdev_crypto.c vbdev_crypto_rpc.c +LIBNAME = vbdev_crypto + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/crypto/vbdev_crypto.c b/src/spdk/lib/bdev/crypto/vbdev_crypto.c new file mode 100644 index 00000000..510e8496 --- /dev/null +++ b/src/spdk/lib/bdev/crypto/vbdev_crypto.c @@ -0,0 +1,1506 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUcryptoION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_crypto.h" + +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/io_channel.h" +#include "spdk/bdev_module.h" + +#include +#include +#include +#include +#include + +/* To add support for new device types, follow the examples of the following... + * Note that the string names are defined by the DPDK PMD in question so be + * sure to use the exact names. + */ +#define MAX_NUM_DRV_TYPES 2 +#define AESNI_MB "crypto_aesni_mb" +#define QAT "crypto_qat" +const char *g_driver_names[MAX_NUM_DRV_TYPES] = { AESNI_MB, QAT }; + +/* Global list of available crypto devices. 
*/ +struct vbdev_dev { + struct rte_cryptodev_info cdev_info; /* includes device friendly name */ + uint8_t cdev_id; /* identifier for the device */ + TAILQ_ENTRY(vbdev_dev) link; +}; +static TAILQ_HEAD(, vbdev_dev) g_vbdev_devs = TAILQ_HEAD_INITIALIZER(g_vbdev_devs); + +/* Global list and lock for unique device/queue pair combos */ +struct device_qp { + struct vbdev_dev *device; /* ptr to crypto device */ + uint8_t qp; /* queue pair for this node */ + bool in_use; /* whether this node is in use or not */ + TAILQ_ENTRY(device_qp) link; +}; +static TAILQ_HEAD(, device_qp) g_device_qp = TAILQ_HEAD_INITIALIZER(g_device_qp); +static pthread_mutex_t g_device_qp_lock = PTHREAD_MUTEX_INITIALIZER; + + +/* In order to limit the number of resources we need to do one crypto + * operation per LBA (we use LBA as IV), we tell the bdev layer that + * our max IO size is something reasonable. Units here are in bytes. + */ +#define CRYPTO_MAX_IO (64 * 1024) + +/* This controls how many ops will be dequeued from the crypto driver in one run + * of the poller. It is mainly a performance knob as it effectively determines how + * much work the poller has to do. However even that can vary between crypto drivers + * as the AESNI_MB driver for example does all the crypto work on dequeue whereas the + * QAT drvier just dequeues what has been completed already. + */ +#define MAX_DEQUEUE_BURST_SIZE 64 + +/* When enqueueing, we need to supply the crypto driver with an array of pointers to + * operation structs. As each of these can be max 512B, we can adjust the CRYPTO_MAX_IO + * value in conjunction with the the other defines to make sure we're not using crazy amounts + * of memory. All of these numbers can and probably should be adjusted based on the + * workload. By default we'll use the worst case (smallest) block size for the + * minimum number of array entries. As an example, a CRYPTO_MAX_IO size of 64K with 512B + * blocks would give us an enqueue array size of 128. + */ +#define MAX_ENQUEUE_ARRAY_SIZE (CRYPTO_MAX_IO / 512) + +/* The number of MBUFS we need must be a power of two and to support other small IOs + * in addition to the limits mentioned above, we go to the next power of two. It is + * big number because it is one mempool for source and desitnation mbufs. It may + * need to be bigger to support multiple crypto drivers at once. + */ +#define NUM_MBUFS 32768 +#define POOL_CACHE_SIZE 256 +#define NUM_SESSIONS NUM_MBUFS +#define SESS_MEMPOOL_CACHE_SIZE 256 + +/* This is the max number of IOs we can supply to any crypto device QP at one time. + * It can vary between drivers. + */ +#define CRYPTO_QP_DESCRIPTORS 2048 + +/* Specific to AES_CBC. */ +#define AES_CBC_IV_LENGTH 16 +#define AES_CBC_KEY_LENGTH 16 + +/* Common for suported devices. */ +#define IV_OFFSET (sizeof(struct rte_crypto_op) + \ + sizeof(struct rte_crypto_sym_op)) + +static void _complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +static void _complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +static void _complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); +static void vbdev_crypto_examine(struct spdk_bdev *bdev); +static int vbdev_crypto_claim(struct spdk_bdev *bdev); + +/* list of crypto_bdev names and their base bdevs via configuration file. + * Used so we can parse the conf once at init and use this list in examine(). 
+ */ +struct bdev_names { + char *vbdev_name; /* name of the vbdev to create */ + char *bdev_name; /* base bdev name */ + + /* Note, for dev/test we allow use of key in the config file, for production + * use, you must use an RPC to specify the key for security reasons. + */ + uint8_t *key; /* key per bdev */ + char *drv_name; /* name of the crypto device driver */ + TAILQ_ENTRY(bdev_names) link; +}; +static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names); + +/* List of virtual bdevs and associated info for each. We keep the device friendly name here even + * though its also in the device struct because we use it early on. + */ +struct vbdev_crypto { + struct spdk_bdev *base_bdev; /* the thing we're attaching to */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct spdk_bdev crypto_bdev; /* the crypto virtual bdev */ + uint8_t *key; /* key per bdev */ + char *drv_name; /* name of the crypto device driver */ + TAILQ_ENTRY(vbdev_crypto) link; +}; +static TAILQ_HEAD(, vbdev_crypto) g_vbdev_crypto = TAILQ_HEAD_INITIALIZER(g_vbdev_crypto); + +/* Shared mempools between all devices on this system */ +static struct spdk_mempool *g_session_mp = NULL; /* session mempool */ +static struct spdk_mempool *g_mbuf_mp = NULL; /* mbuf mempool */ +static struct rte_mempool *g_crypto_op_mp = NULL; /* crypto operations, must be rte* mempool */ + +/* The crypto vbdev channel struct. It is allocated and freed on my behalf by the io channel code. + * We store things in here that are needed on per thread basis like the base_channel for this thread, + * and the poller for this thread. + */ +struct crypto_io_channel { + struct spdk_io_channel *base_ch; /* IO channel of base device */ + struct spdk_poller *poller; /* completion poller */ + struct device_qp *device_qp; /* unique device/qp combination for this channel */ +}; + +/* This is the crypto per IO context that the bdev layer allocates for us opaquely and attaches to + * each IO for us. + */ +struct crypto_bdev_io { + int cryop_cnt_remaining; /* counter used when completing crypto ops */ + struct crypto_io_channel *crypto_ch; /* need to store for crypto completion handling */ + struct vbdev_crypto *crypto_bdev; /* the crypto node struct associated with this IO */ + enum rte_crypto_cipher_operation crypto_op; /* the crypto control struct */ + struct rte_crypto_sym_xform cipher_xform; /* crypto control struct for this IO */ + struct spdk_bdev_io *orig_io; /* the original IO */ + struct spdk_bdev_io *read_io; /* the read IO we issued */ + + /* Used for the single contigous buffer that serves as the crypto destination target for writes */ + uint64_t cry_num_blocks; /* num of blocks for the contiguous buffer */ + uint64_t cry_offset_blocks; /* block offset on media */ + struct iovec cry_iov; /* iov representing contig write buffer */ +}; + +/* This is called from the module's init function. We setup all crypto devices early on as we are unable + * to easily dynamically configure queue pairs after the drivers are up and running. So, here, we + * configure the max capabilities of each device and assign threads to queue pairs as channels are + * requested. 
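The driver-init path below insists on one dedicated queue pair per lcore for each driver type so that data-path access to a queue pair never needs locking. A minimal sketch of that capacity requirement, with hypothetical device counts (the helper name and numbers are illustrative only, not part of the module):

#include <errno.h>

/* Sketch only: with e.g. 2 devices of a given driver type exposing 8 queue
 * pairs each, at most 16 lcores can each own a queue pair without sharing.
 */
static int
example_check_qp_capacity(unsigned int num_devs, unsigned int qps_per_dev,
			  unsigned int num_lcores)
{
	if (num_devs * qps_per_dev < num_lcores) {
		return -EINVAL;	/* not enough unique device/queue-pair combinations */
	}
	return 0;
}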
+ */ +static int +vbdev_crypto_init_crypto_drivers(void) +{ + uint8_t cdev_count; + uint8_t cdrv_id, cdev_id, i, j; + int rc = 0; + struct vbdev_dev *device = NULL; + struct device_qp *dev_qp = NULL; + unsigned int max_sess_size = 0, sess_size; + uint16_t num_lcores = rte_lcore_count(); + + /* Only the first call, via RPC or module init should init the crypto drivers. */ + if (g_session_mp != NULL) { + return 0; + } + + /* We always init AESNI_MB */ + rc = rte_vdev_init(AESNI_MB, NULL); + if (rc == 0) { + SPDK_NOTICELOG("created virtual PMD %s\n", AESNI_MB); + } else { + SPDK_ERRLOG("error creating virtual PMD %s\n", AESNI_MB); + return -EINVAL; + } + + /* If we have no crypto devices, there's no reason to continue. */ + cdev_count = rte_cryptodev_count(); + if (cdev_count == 0) { + return 0; + } + + /* + * Create global mempools, shared by all devices regardless of type. + */ + + /* First determine max session size, most pools are shared by all the devices, + * so we need to find the global max sessions size. + */ + for (cdev_id = 0; cdev_id < cdev_count; cdev_id++) { + sess_size = rte_cryptodev_sym_get_private_session_size(cdev_id); + if (sess_size > max_sess_size) { + max_sess_size = sess_size; + } + } + + g_session_mp = spdk_mempool_create("session_mp", NUM_SESSIONS * 2, max_sess_size, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (g_session_mp == NULL) { + SPDK_ERRLOG("Cannot create session pool max size 0x%x\n", max_sess_size); + return -ENOMEM; + } + + g_mbuf_mp = spdk_mempool_create("mbuf_mp", NUM_MBUFS, sizeof(struct rte_mbuf), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (g_mbuf_mp == NULL) { + SPDK_ERRLOG("Cannot create mbuf pool\n"); + rc = -ENOMEM; + goto error_create_mbuf; + } + + g_crypto_op_mp = rte_crypto_op_pool_create("op_mp", + RTE_CRYPTO_OP_TYPE_SYMMETRIC, + NUM_MBUFS, + POOL_CACHE_SIZE, + AES_CBC_IV_LENGTH, + rte_socket_id()); + if (g_crypto_op_mp == NULL) { + SPDK_ERRLOG("Cannot create op pool\n"); + rc = -ENOMEM; + goto error_create_op; + } + + /* + * Now lets configure each device. + */ + for (i = 0; i < cdev_count; i++) { + device = calloc(1, sizeof(struct vbdev_dev)); + if (!device) { + rc = -ENOMEM; + goto error_create_device; + } + + /* Get details about this device. */ + rte_cryptodev_info_get(i, &device->cdev_info); + cdrv_id = device->cdev_info.driver_id; + cdev_id = device->cdev_id = i; + + /* Before going any further, make sure we have enough resources for this + * device type to function. We need a unique queue pair per core accross each + * device type to remain lockless.... + */ + if ((rte_cryptodev_device_count_by_driver(cdrv_id) * + device->cdev_info.max_nb_queue_pairs) < num_lcores) { + SPDK_ERRLOG("Insufficient unique queue pairs available for %s\n", + device->cdev_info.driver_name); + SPDK_ERRLOG("Either add more crypto devices or decrease core count\n"); + rc = -EINVAL; + goto error_qp; + } + + /* Setup queue pairs. */ + struct rte_cryptodev_config conf = { + .nb_queue_pairs = device->cdev_info.max_nb_queue_pairs, + .socket_id = SPDK_ENV_SOCKET_ID_ANY + }; + + rc = rte_cryptodev_configure(cdev_id, &conf); + if (rc < 0) { + SPDK_ERRLOG("Failed to configure cryptodev %u", cdev_id); + rc = -EINVAL; + goto error_dev_config; + } + + struct rte_cryptodev_qp_conf qp_conf = { + .nb_descriptors = CRYPTO_QP_DESCRIPTORS + }; + + /* Pre-setup all pottential qpairs now and assign them in the channel + * callback. 
If we were to create them there, we'd have to stop the + * entire device affecting all other threads that might be using it + * even on other queue pairs. + */ + for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) { + rc = rte_cryptodev_queue_pair_setup(cdev_id, j, &qp_conf, SOCKET_ID_ANY, + (struct rte_mempool *)g_session_mp); + + if (rc < 0) { + SPDK_ERRLOG("Failed to setup queue pair %u on " + "cryptodev %u", j, cdev_id); + rc = -EINVAL; + goto error_qp_setup; + } + } + + rc = rte_cryptodev_start(cdev_id); + if (rc < 0) { + SPDK_ERRLOG("Failed to start device %u: error %d\n", + cdev_id, rc); + rc = -EINVAL; + goto error_device_start; + } + + /* Add to our list of available crypto devices. */ + TAILQ_INSERT_TAIL(&g_vbdev_devs, device, link); + + /* Build up list of device/qp combinations */ + for (j = 0; j < device->cdev_info.max_nb_queue_pairs; j++) { + dev_qp = calloc(1, sizeof(struct device_qp)); + if (!dev_qp) { + rc = -ENOMEM; + goto error_create_devqp; + } + dev_qp->device = device; + dev_qp->qp = j; + dev_qp->in_use = false; + TAILQ_INSERT_TAIL(&g_device_qp, dev_qp, link); + } + } + return 0; + + /* Error cleanup paths. */ +error_create_devqp: + while ((dev_qp = TAILQ_FIRST(&g_device_qp))) { + TAILQ_REMOVE(&g_device_qp, dev_qp, link); + free(dev_qp); + } +error_device_start: +error_qp_setup: +error_dev_config: +error_qp: + free(device); +error_create_device: + rte_mempool_free(g_crypto_op_mp); +error_create_op: + spdk_mempool_free(g_mbuf_mp); +error_create_mbuf: + spdk_mempool_free(g_session_mp); + return rc; +} + +/* Following an encrypt or decrypt we need to then either write the encrypted data or finish + * the read on decrypted data. Do that here. + */ +static void +_crypto_operation_complete(struct spdk_bdev_io *bdev_io) +{ + struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto, + crypto_bdev); + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch; + struct spdk_bdev_io *free_me = io_ctx->read_io; + int rc = 0; + + if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_FAILED) { + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + + /* Complete the original IO and then free the one that we created + * as a result of issuing an IO via submit_reqeust. + */ + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + spdk_bdev_free_io(free_me); + + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + + /* Write the encrypted data. */ + rc = spdk_bdev_writev_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, + &io_ctx->cry_iov, 1, io_ctx->cry_offset_blocks, + io_ctx->cry_num_blocks, _complete_internal_write, + bdev_io); + } else { + + /* Something really went haywire if this function got called with a type + * other than read or write. + */ + rc = -1; + } + } else { + /* If the poller found that one of the crypto ops had failed as part of this + * bdev_io it would have updated the internal status indicate failure. + */ + rc = -1; + } + + if (rc != 0) { + SPDK_ERRLOG("ERROR on crypto operation completion!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + +} + +/* This is the poller for the crypto device. It uses a single API to dequeue whatever is ready at + * the device. Then we need to decide if what we've got so far (including previous poller + * runs) totals up to one or more complete bdev_ios and if so continue with the bdev_io + * accordingly. This means either completing a read or issuing a new write. 
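Before the poller itself, a minimal sketch of the completion-counting idea it relies on: every bdev_io carries a count of outstanding crypto operations, and only the final dequeued operation completes the I/O, regardless of dequeue order. The struct and function names below are illustrative, not the module's own:

/* Sketch only: the last crypto op to finish is the one that completes the
 * bdev_io.
 */
struct example_io_ctx {
	int ops_remaining;	/* set to the number of crypto ops at submission */
};

static int
example_op_done(struct example_io_ctx *ctx)
{
	/* Returns non-zero when this was the final op for the I/O. */
	return --ctx->ops_remaining == 0;
}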
+ */ +static int +crypto_dev_poller(void *args) +{ + struct crypto_io_channel *crypto_ch = args; + uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id; + int i, num_dequeued_ops; + struct spdk_bdev_io *bdev_io = NULL; + struct crypto_bdev_io *io_ctx = NULL; + struct rte_crypto_op *dequeued_ops[MAX_DEQUEUE_BURST_SIZE]; + struct rte_crypto_op *mbufs_to_free[2 * MAX_DEQUEUE_BURST_SIZE]; + int num_mbufs = 0; + + /* Each run of the poller will get just what the device has available + * at the moment we call it, we don't check again after draining the + * first batch. + */ + num_dequeued_ops = rte_cryptodev_dequeue_burst(cdev_id, crypto_ch->device_qp->qp, + dequeued_ops, MAX_DEQUEUE_BURST_SIZE); + + /* Check if operation was processed successfully */ + for (i = 0; i < num_dequeued_ops; i++) { + + /* We don't know the order or association of the crypto ops wrt any + * partiular bdev_io so need to look at each and determine if it's + * the last one for it's bdev_io or not. + */ + bdev_io = (struct spdk_bdev_io *)dequeued_ops[i]->sym->m_src->userdata; + assert(bdev_io != NULL); + + if (dequeued_ops[i]->status != RTE_CRYPTO_OP_STATUS_SUCCESS) { + SPDK_ERRLOG("error with op %d status %u\n", i, + dequeued_ops[i]->status); + /* Update the bdev status to error, we'll still process the + * rest of the crypto ops for this bdev_io though so they + * aren't left hanging. + */ + bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; + } + + io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + assert(io_ctx->cryop_cnt_remaining > 0); + + /* Return the associated src and dst mbufs by collecting them into + * an array that we can use the bulk API to free after the loop. + */ + dequeued_ops[i]->sym->m_src->userdata = NULL; + mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_src; + if (dequeued_ops[i]->sym->m_dst) { + mbufs_to_free[num_mbufs++] = (void *)dequeued_ops[i]->sym->m_dst; + } + + /* done encrypting, complete the bdev_io */ + if (--io_ctx->cryop_cnt_remaining == 0) { + + /* Complete the IO */ + _crypto_operation_complete(bdev_io); + + /* Return session */ + rte_cryptodev_sym_session_clear(cdev_id, dequeued_ops[i]->sym->session); + rte_cryptodev_sym_session_free(dequeued_ops[i]->sym->session); + } + } + + /* Now bulk free both mbufs and crypto operations. */ + if (num_dequeued_ops > 0) { + rte_mempool_put_bulk(g_crypto_op_mp, + (void **)dequeued_ops, + num_dequeued_ops); + assert(num_mbufs > 0); + spdk_mempool_put_bulk(g_mbuf_mp, + (void **)mbufs_to_free, + num_mbufs); + } + + return num_dequeued_ops; +} + +/* We're either encrypting on the way down or decrypting on the way back. 
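The submission path that follows walks the request's iovecs in block-sized steps, producing exactly one crypto operation per logical block. A stripped-down sketch of that walk under the same assumption the real code makes (every iovec holds a whole number of blocks); names are illustrative and no DPDK structures are involved:

#include <stdint.h>
#include <sys/uio.h>

/* Sketch only: emit one source pointer per block, advancing through the iovec
 * list as each one is consumed. Assumes the iovecs hold exactly
 * num_blocks * blocklen bytes in block-aligned pieces.
 */
static void
example_split_into_blocks(const struct iovec *iovs, uint64_t num_blocks,
			  uint32_t blocklen, uint8_t **op_src)
{
	uint32_t iov_index = 0;
	uint8_t *cur = iovs[0].iov_base;
	uint64_t cur_remaining = iovs[0].iov_len;
	uint64_t block;

	for (block = 0; block < num_blocks; block++) {
		op_src[block] = cur;
		cur += blocklen;
		cur_remaining -= blocklen;
		if (cur_remaining == 0 && block + 1 < num_blocks) {
			iov_index++;
			cur = iovs[iov_index].iov_base;
			cur_remaining = iovs[iov_index].iov_len;
		}
	}
}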
*/ +static int +_crypto_operation(struct spdk_bdev_io *bdev_io, enum rte_crypto_cipher_operation crypto_op) +{ + struct rte_cryptodev_sym_session *session; + uint16_t num_enqueued_ops = 0; + uint32_t cryop_cnt = bdev_io->u.bdev.num_blocks; + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + struct crypto_io_channel *crypto_ch = io_ctx->crypto_ch; + uint8_t cdev_id = crypto_ch->device_qp->device->cdev_id; + uint32_t crypto_len = io_ctx->crypto_bdev->crypto_bdev.blocklen; + uint64_t total_length = bdev_io->u.bdev.num_blocks * crypto_len; + int rc; + uint32_t enqueued = 0; + uint32_t iov_index = 0; + uint32_t allocated = 0; + uint8_t *current_iov = NULL; + uint64_t total_remaining = 0; + uint64_t current_iov_remaining = 0; + int completed = 0; + int crypto_index = 0; + uint32_t en_offset = 0; + struct rte_crypto_op *crypto_ops[MAX_ENQUEUE_ARRAY_SIZE]; + struct rte_mbuf *src_mbufs[MAX_ENQUEUE_ARRAY_SIZE]; + struct rte_mbuf *dst_mbufs[MAX_ENQUEUE_ARRAY_SIZE]; + int burst; + + assert((bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen) <= CRYPTO_MAX_IO); + + /* Get the number of source mbufs that we need. These will always be 1:1 because we + * don't support chaining. The reason we don't is because of our decision to use + * LBA as IV, there can be no case where we'd need >1 mbuf per crypto op or the + * op would be > 1 LBA. + */ + rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&src_mbufs[0], cryop_cnt); + if (rc) { + SPDK_ERRLOG("ERROR trying to get src_mbufs!\n"); + return -ENOMEM; + } + + /* Get the same amount but these buffers to describe the encrypted data location (dst). */ + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + rc = spdk_mempool_get_bulk(g_mbuf_mp, (void **)&dst_mbufs[0], cryop_cnt); + if (rc) { + SPDK_ERRLOG("ERROR trying to get dst_mbufs!\n"); + rc = -ENOMEM; + goto error_get_dst; + } + } + + /* Allocate crypto operations. */ + allocated = rte_crypto_op_bulk_alloc(g_crypto_op_mp, + RTE_CRYPTO_OP_TYPE_SYMMETRIC, + crypto_ops, cryop_cnt); + if (allocated < cryop_cnt) { + SPDK_ERRLOG("ERROR trying to get crypto ops!\n"); + rc = -ENOMEM; + goto error_get_ops; + } + + /* Get sessions. */ + session = rte_cryptodev_sym_session_create((struct rte_mempool *)g_session_mp); + if (NULL == session) { + SPDK_ERRLOG("ERROR trying to create crypto session!\n"); + rc = -EINVAL; + goto error_session_create; + } + + /* Init our session with the desired cipher options. */ + io_ctx->cipher_xform.type = RTE_CRYPTO_SYM_XFORM_CIPHER; + io_ctx->cipher_xform.cipher.key.data = io_ctx->crypto_bdev->key; + io_ctx->cipher_xform.cipher.op = io_ctx->crypto_op = crypto_op; + io_ctx->cipher_xform.cipher.iv.offset = IV_OFFSET; + io_ctx->cipher_xform.cipher.algo = RTE_CRYPTO_CIPHER_AES_CBC; + io_ctx->cipher_xform.cipher.key.length = AES_CBC_KEY_LENGTH; + io_ctx->cipher_xform.cipher.iv.length = AES_CBC_IV_LENGTH; + + rc = rte_cryptodev_sym_session_init(cdev_id, session, + &io_ctx->cipher_xform, + (struct rte_mempool *)g_session_mp); + if (rc < 0) { + SPDK_ERRLOG("ERROR trying to init crypto session!\n"); + rc = -EINVAL; + goto error_session_init; + } + + /* For encryption, we need to prepare a single contiguous buffer as the encryption + * destination, we'll then pass that along for the write after encryption is done. + * This is done to avoiding encrypting the provided write buffer which may be + * undesirable in some use cases. 
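Worth spelling out before the per-block loop below: because the module uses the logical block address as the AES-CBC initialization vector (one crypto operation per block, no mbuf chaining), each 16-byte IV is simply the 8-byte block offset zero-padded to the IV length, in host byte order. A standalone sketch of that derivation (the helper name is illustrative):

#include <stdint.h>
#include <string.h>

#define EXAMPLE_AES_CBC_IV_LENGTH 16

/* Sketch only: derive the IV for one crypto op from its logical block address,
 * mirroring the per-block loop that follows (zero-pad, then copy the LBA).
 */
static void
example_lba_to_iv(uint64_t lba, uint8_t iv[EXAMPLE_AES_CBC_IV_LENGTH])
{
	memset(iv, 0, EXAMPLE_AES_CBC_IV_LENGTH);
	memcpy(iv, &lba, sizeof(lba));
}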
+ */ + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + io_ctx->cry_iov.iov_len = total_length; + /* For now just allocate in the I/O path, not optimal but the current bdev API + * for getting a buffer from the pool won't work if the bdev_io passed in + * has a buffer, which ours always will. So, until we modify that API + * or better yet the current ZCOPY work lands, this is the best we can do. + */ + io_ctx->cry_iov.iov_base = spdk_dma_malloc(total_length, 0x10, NULL); + if (!io_ctx->cry_iov.iov_base) { + SPDK_ERRLOG("ERROR trying to allocate write buffer for encryption!\n"); + rc = -ENOMEM; + goto error_get_write_buffer; + } + io_ctx->cry_offset_blocks = bdev_io->u.bdev.offset_blocks; + io_ctx->cry_num_blocks = bdev_io->u.bdev.num_blocks; + } + + /* This value is used in the completion callback to determine when the bdev_io is + * complete. + */ + io_ctx->cryop_cnt_remaining = cryop_cnt; + + /* As we don't support chaining because of a decision to use LBA as IV, construction + * of crypto operaations is straightforward. We build both the op, the mbuf and the + * dst_mbuf in our local arrays by looping through the length of the bdev IO and + * picking off LBA sized blocks of memory from the IOVs as we walk through them. Each + * LBA sized chunck of memory will correspond 1:1 to a crypto operation and a single + * mbuf per crypto operation. + */ + total_remaining = total_length; + current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base; + current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len; + do { + uint8_t *iv_ptr; + uint64_t op_block_offset; + + /* Set the mbuf elements address and length. Null out the next pointer. */ + src_mbufs[crypto_index]->buf_addr = current_iov; + src_mbufs[crypto_index]->buf_iova = spdk_vtophys((void *)current_iov); + src_mbufs[crypto_index]->data_len = crypto_len; + src_mbufs[crypto_index]->next = NULL; + /* Store context in every mbuf as we don't know anything about completion order */ + src_mbufs[crypto_index]->userdata = bdev_io; + + /* Set the IV - we use the LBA of the crypto_op */ + iv_ptr = rte_crypto_op_ctod_offset(crypto_ops[crypto_index], uint8_t *, + IV_OFFSET); + memset(iv_ptr, 0, AES_CBC_IV_LENGTH); + op_block_offset = bdev_io->u.bdev.offset_blocks + crypto_index; + rte_memcpy(iv_ptr, &op_block_offset, sizeof(uint64_t)); + + /* Set the data to encrypt/decrypt length */ + crypto_ops[crypto_index]->sym->cipher.data.length = crypto_len; + crypto_ops[crypto_index]->sym->cipher.data.offset = 0; + + /* link the mbuf to the crypto op. */ + crypto_ops[crypto_index]->sym->m_src = src_mbufs[crypto_index]; + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + crypto_ops[crypto_index]->sym->m_dst = src_mbufs[crypto_index]; + } else { + crypto_ops[crypto_index]->sym->m_dst = NULL; + } + + /* For encrypt, point the destination to a buffer we allocate and redirect the bdev_io + * that will be used to process the write on completion to the same buffer. Setting + * up the en_buffer is a little simpler as we know the destination buffer is single IOV. + */ + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + + /* Set the relevant destination en_mbuf elements. 
*/ + dst_mbufs[crypto_index]->buf_addr = io_ctx->cry_iov.iov_base + en_offset; + dst_mbufs[crypto_index]->buf_iova = spdk_vtophys(dst_mbufs[crypto_index]->buf_addr); + dst_mbufs[crypto_index]->data_len = crypto_len; + crypto_ops[crypto_index]->sym->m_dst = dst_mbufs[crypto_index]; + en_offset += crypto_len; + dst_mbufs[crypto_index]->next = NULL; + } + + /* Attach the crypto session to the operation */ + rc = rte_crypto_op_attach_sym_session(crypto_ops[crypto_index], session); + if (rc) { + rc = -EINVAL; + goto error_attach_session; + } + + /* Subtract our running totals for the op in progress and the overall bdev io */ + total_remaining -= crypto_len; + current_iov_remaining -= crypto_len; + + /* move our current IOV pointer accordingly. */ + current_iov += crypto_len; + + /* move on to the next crypto operation */ + crypto_index++; + + /* If we're done with this IOV, move to the next one. */ + if (current_iov_remaining == 0 && total_remaining > 0) { + iov_index++; + current_iov = bdev_io->u.bdev.iovs[iov_index].iov_base; + current_iov_remaining = bdev_io->u.bdev.iovs[iov_index].iov_len; + } + } while (total_remaining > 0); + + /* Enqueue everything we've got but limit by the max number of descriptors we + * configured the crypto device for. + */ + do { + burst = spdk_min((cryop_cnt - enqueued), CRYPTO_QP_DESCRIPTORS); + num_enqueued_ops = rte_cryptodev_enqueue_burst(cdev_id, crypto_ch->device_qp->qp, + &crypto_ops[enqueued], + burst); + enqueued += num_enqueued_ops; + + /* Dequeue all inline if the device is full. We don't defer anything simply + * because of the complexity involved as we're building 1 or more crypto + * ops per IO. Dequeue will free up space for more enqueue. + */ + if (enqueued < cryop_cnt) { + + /* Dequeue everything, this may include ops that were already + * in the device before this submission.... + */ + do { + completed = crypto_dev_poller(crypto_ch); + } while (completed > 0); + } + } while (enqueued < cryop_cnt); + + return rc; + + /* Error cleanup paths. */ +error_attach_session: +error_get_write_buffer: +error_session_init: + rte_cryptodev_sym_session_clear(cdev_id, session); + rte_cryptodev_sym_session_free(session); +error_session_create: + rte_mempool_put_bulk(g_crypto_op_mp, (void **)crypto_ops, cryop_cnt); + allocated = 0; +error_get_ops: + if (crypto_op == RTE_CRYPTO_CIPHER_OP_ENCRYPT) { + spdk_mempool_put_bulk(g_mbuf_mp, (void **)&dst_mbufs[0], + cryop_cnt); + } + if (allocated > 0) { + rte_mempool_put_bulk(g_crypto_op_mp, (void **)crypto_ops, + allocated); + } +error_get_dst: + spdk_mempool_put_bulk(g_mbuf_mp, (void **)&src_mbufs[0], + cryop_cnt); + return rc; +} + +/* Completion callback for IO that were issued from this bdev other than read/write. + * They have their own for readability. + */ +static void +_complete_internal_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +/* Completion callback for writes that were issued from this bdev. */ +static void +_complete_internal_write(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx; + + spdk_dma_free(orig_ctx->cry_iov.iov_base); + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +/* Completion callback for reads that were issued from this bdev. */ +static void +_complete_internal_read(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + struct crypto_bdev_io *orig_ctx = (struct crypto_bdev_io *)orig_io->driver_ctx; + + if (success) { + + /* Save off this bdev_io so it can be freed after decryption. */ + orig_ctx->read_io = bdev_io; + + if (_crypto_operation(orig_io, RTE_CRYPTO_CIPHER_OP_DECRYPT)) { + SPDK_ERRLOG("ERROR decrypting"); + spdk_bdev_io_complete(orig_io, SPDK_BDEV_IO_STATUS_FAILED); + spdk_bdev_free_io(bdev_io); + } + } else { + SPDK_ERRLOG("ERROR on read prior to decrypting"); + spdk_bdev_io_complete(orig_io, SPDK_BDEV_IO_STATUS_FAILED); + spdk_bdev_free_io(bdev_io); + } +} + +/* Callback for getting a buf from the bdev pool in the event that the caller passed + * in NULL, we need to own the buffer so it doesn't get freed by another vbdev module + * beneath us before we're done with it. + */ +static void +crypto_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto, + crypto_bdev); + struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch); + int rc; + + rc = spdk_bdev_readv_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _complete_internal_read, + bdev_io); + if (rc != 0) { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* Called when someone submits IO to this crypto vbdev. For IO's not relevant to crypto, + * we're simply passing it on here via SPDK IO calls which in turn allocate another bdev IO + * and call our cpl callback provided below along with the original bdev_io so that we can + * complete it once this IO completes. For crypto operations, we'll either encrypt it first + * (writes) then call back into bdev to submit it or we'll submit a read and then catch it + * on the way back for decryption. 
+ */ +static void +vbdev_crypto_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_crypto *crypto_bdev = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_crypto, + crypto_bdev); + struct crypto_io_channel *crypto_ch = spdk_io_channel_get_ctx(ch); + struct crypto_bdev_io *io_ctx = (struct crypto_bdev_io *)bdev_io->driver_ctx; + int rc = 0; + + memset(io_ctx, 0, sizeof(struct crypto_bdev_io)); + io_ctx->crypto_bdev = crypto_bdev; + io_ctx->crypto_ch = crypto_ch; + io_ctx->orig_io = bdev_io; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, crypto_read_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + rc = _crypto_operation(bdev_io, RTE_CRYPTO_CIPHER_OP_ENCRYPT); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = spdk_bdev_unmap_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _complete_internal_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + rc = spdk_bdev_flush_blocks(crypto_bdev->base_desc, crypto_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _complete_internal_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + rc = spdk_bdev_reset(crypto_bdev->base_desc, crypto_ch->base_ch, + _complete_internal_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + default: + SPDK_ERRLOG("crypto: unknown I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + if (rc != 0) { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* We'll just call the base bdev and let it answer except for WZ command which + * we always say we don't support so that the bdev layer will actually send us + * real writes that we can encrypt. + */ +static bool +vbdev_crypto_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_FLUSH: + return spdk_bdev_io_type_supported(crypto_bdev->base_bdev, io_type); + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* Force the bdev layer to issue actual writes of zeroes so we can + * encrypt them as regular writes. + */ + default: + return false; + } +} + +/* Called after we've unregistered following a hot remove callback. + * Our finish entry point will be called next. + */ +static int +vbdev_crypto_destruct(void *ctx) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + /* Unclaim the underlying bdev. */ + spdk_bdev_module_release_bdev(crypto_bdev->base_bdev); + + /* Close the underlying bdev. */ + spdk_bdev_close(crypto_bdev->base_desc); + + /* Done with this crypto_bdev. */ + TAILQ_REMOVE(&g_vbdev_crypto, crypto_bdev, link); + free(crypto_bdev->drv_name); + free(crypto_bdev->key); + free(crypto_bdev->crypto_bdev.name); + free(crypto_bdev); + return 0; +} + +/* We supplied this as an entry point for upper layers who want to communicate to this + * bdev. This is how they get a channel. We are passed the same context we provided when + * we created our crypto vbdev in examine() which, for this bdev, is the address of one of + * our context nodes. From here we'll ask the SPDK channel code to fill out our channel + * struct and we'll keep it in our crypto node. 
+ */ +static struct spdk_io_channel * +vbdev_crypto_get_io_channel(void *ctx) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + /* The IO channel code will allocate a channel for us which consists of + * the SPDK cahnnel structure plus the size of our crypto_io_channel struct + * that we passed in when we registered our IO device. It will then call + * our channel create callback to populate any elements that we need to + * update. + */ + return spdk_get_io_channel(crypto_bdev); +} + +/* This is the output for get_bdevs() for this vbdev */ +static int +vbdev_crypto_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct vbdev_crypto *crypto_bdev = (struct vbdev_crypto *)ctx; + + spdk_json_write_name(w, "crypto"); + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev)); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev)); + spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name); + spdk_json_write_named_string(w, "key", crypto_bdev->key); + spdk_json_write_object_end(w); + return 0; +} + +static int +vbdev_crypto_config_json(struct spdk_json_write_ctx *w) +{ + struct vbdev_crypto *crypto_bdev, *tmp; + + TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_crypto_bdev"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(crypto_bdev->base_bdev)); + spdk_json_write_named_string(w, "name", spdk_bdev_get_name(&crypto_bdev->crypto_bdev)); + spdk_json_write_named_string(w, "crypto_pmd", crypto_bdev->drv_name); + spdk_json_write_named_string(w, "key", crypto_bdev->key); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + } + return 0; +} + +/* We provide this callback for the SPDK channel code to create a channel using + * the channel struct we provided in our module get_io_channel() entry point. Here + * we get and save off an underlying base channel of the device below us so that + * we can communicate with the base bdev on a per channel basis. We also register the + * poller used to complete crypto operations from the device. + */ +static int +crypto_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct crypto_io_channel *crypto_ch = ctx_buf; + struct vbdev_crypto *crypto_bdev = io_device; + struct device_qp *device_qp; + + crypto_ch->base_ch = spdk_bdev_get_io_channel(crypto_bdev->base_desc); + crypto_ch->poller = spdk_poller_register(crypto_dev_poller, crypto_ch, 0); + crypto_ch->device_qp = NULL; + + pthread_mutex_lock(&g_device_qp_lock); + TAILQ_FOREACH(device_qp, &g_device_qp, link) { + if ((strcmp(device_qp->device->cdev_info.driver_name, crypto_bdev->drv_name) == 0) && + (device_qp->in_use == false)) { + crypto_ch->device_qp = device_qp; + device_qp->in_use = true; + SPDK_NOTICELOG("Device queue pair assignment: ch %p device %p qpid %u %s\n", + crypto_ch, device_qp->device, crypto_ch->device_qp->qp, crypto_bdev->drv_name); + break; + } + } + pthread_mutex_unlock(&g_device_qp_lock); + assert(crypto_ch->device_qp); + return 0; +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created. 
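The channel create and destroy callbacks above and below bracket a simple claim/release protocol: a free device/queue-pair node is marked in_use under a global lock when a channel is created and handed back when the channel is destroyed, so each channel owns its queue pair exclusively. A minimal sketch of that pattern with illustrative names:

#include <pthread.h>
#include <stdbool.h>

/* Sketch only: claim a free slot under a lock at channel-create time and
 * release it at channel-destroy time; one slot per channel, never shared.
 */
struct example_slot {
	bool in_use;
};

static pthread_mutex_t example_lock = PTHREAD_MUTEX_INITIALIZER;

static struct example_slot *
example_claim(struct example_slot *slots, int count)
{
	struct example_slot *claimed = NULL;
	int i;

	pthread_mutex_lock(&example_lock);
	for (i = 0; i < count; i++) {
		if (!slots[i].in_use) {
			slots[i].in_use = true;
			claimed = &slots[i];
			break;
		}
	}
	pthread_mutex_unlock(&example_lock);
	return claimed;
}

static void
example_release(struct example_slot *slot)
{
	pthread_mutex_lock(&example_lock);
	slot->in_use = false;
	pthread_mutex_unlock(&example_lock);
}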
+ */ +static void +crypto_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct crypto_io_channel *crypto_ch = ctx_buf; + + pthread_mutex_lock(&g_device_qp_lock); + crypto_ch->device_qp->in_use = false; + pthread_mutex_unlock(&g_device_qp_lock); + + spdk_poller_unregister(&crypto_ch->poller); + spdk_put_io_channel(crypto_ch->base_ch); +} + +/* Create the association from the bdev and vbdev name and insert + * on the global list. */ +static int +vbdev_crypto_insert_name(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key) +{ + struct bdev_names *name; + int rc, j; + bool found = false; + + name = calloc(1, sizeof(struct bdev_names)); + if (!name) { + SPDK_ERRLOG("could not allocate bdev_names\n"); + return -ENOMEM; + } + + name->bdev_name = strdup(bdev_name); + if (!name->bdev_name) { + SPDK_ERRLOG("could not allocate name->bdev_name\n"); + rc = -ENOMEM; + goto error_alloc_bname; + } + + name->vbdev_name = strdup(vbdev_name); + if (!name->vbdev_name) { + SPDK_ERRLOG("could not allocate name->vbdev_name\n"); + rc = -ENOMEM; + goto error_alloc_vname; + } + + name->drv_name = strdup(crypto_pmd); + if (!name->drv_name) { + SPDK_ERRLOG("could not allocate name->drv_name\n"); + rc = -ENOMEM; + goto error_alloc_dname; + } + for (j = 0; j < MAX_NUM_DRV_TYPES ; j++) { + if (strcmp(crypto_pmd, g_driver_names[j]) == 0) { + found = true; + break; + } + } + if (!found) { + SPDK_ERRLOG("invalid crypto PMD type %s\n", crypto_pmd); + rc = -EINVAL; + goto error_invalid_pmd; + } + + name->key = strdup(key); + if (!name->key) { + SPDK_ERRLOG("could not allocate name->key\n"); + rc = -ENOMEM; + goto error_alloc_key; + } + if (strlen(name->key) != AES_CBC_KEY_LENGTH) { + SPDK_ERRLOG("invalid AES_CCB key length\n"); + rc = -EINVAL; + goto error_invalid_key; + } + + TAILQ_INSERT_TAIL(&g_bdev_names, name, link); + + return 0; + + /* Error cleanup paths. */ +error_invalid_key: +error_alloc_key: +error_invalid_pmd: + free(name->drv_name); +error_alloc_dname: + free(name->vbdev_name); +error_alloc_vname: + free(name->bdev_name); +error_alloc_bname: + free(name); + return rc; +} + +/* RPC entry point for crypto creation. */ +int +create_crypto_disk(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key) +{ + struct spdk_bdev *bdev = NULL; + struct vbdev_crypto *crypto_bdev, *tmp; + int rc = 0; + + bdev = spdk_bdev_get_by_name(bdev_name); + + rc = vbdev_crypto_insert_name(bdev_name, vbdev_name, crypto_pmd, key); + if (rc) { + return rc; + } + + if (!bdev) { + return 0; + } + + rc = vbdev_crypto_claim(bdev); + if (rc) { + return rc; + } + + rc = vbdev_crypto_init_crypto_drivers(); + if (rc) { + return rc; + } + + TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) { + if (strcmp(crypto_bdev->base_bdev->name, bdev->name) == 0) { + rc = spdk_vbdev_register(&crypto_bdev->crypto_bdev, + &crypto_bdev->base_bdev, 1); + if (rc) { + SPDK_ERRLOG("could not register crypto_bdev\n"); + spdk_bdev_close(crypto_bdev->base_desc); + TAILQ_REMOVE(&g_vbdev_crypto, crypto_bdev, link); + free(crypto_bdev->crypto_bdev.name); + free(crypto_bdev->key); + free(crypto_bdev); + } + break; + } + } + + return rc; +} + +/* Called at driver init time, parses config file to preapre for examine calls, + * also fully initializes the crypto drivers. 
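The config parsing in vbdev_crypto_init() below reads a "[crypto]" section whose "CRY" lines name, in order, the base bdev, the new crypto vbdev, the key, and the PMD driver. A hypothetical section could look like the following; the bdev names and the 16-character key are placeholders, while the driver string must be one of the names in g_driver_names:

/*
 * Hypothetical configuration-file section matching the "CRY" parsing below:
 *
 *   [crypto]
 *   # CRY <base bdev name> <crypto vbdev name> <key> <PMD driver name>
 *   CRY Malloc0 crypto_malloc0 0123456789123456 crypto_aesni_mb
 *
 * The key must be exactly AES_CBC_KEY_LENGTH (16) characters; as the code
 * warns, storing it in a plain-text config file is for dev/test only.
 */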
+ */ +static int +vbdev_crypto_init(void) +{ + struct spdk_conf_section *sp = NULL; + const char *conf_bdev_name = NULL; + const char *conf_vbdev_name = NULL; + const char *crypto_pmd = NULL; + int i; + int rc = 0; + const char *key = NULL; + + /* Fully configure both SW and HW drivers. */ + rc = vbdev_crypto_init_crypto_drivers(); + if (rc) { + SPDK_ERRLOG("Error setting up crypto devices\n"); + return rc; + } + + sp = spdk_conf_find_section(NULL, "crypto"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + + if (!spdk_conf_section_get_nval(sp, "CRY", i)) { + break; + } + + conf_bdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 0); + if (!conf_bdev_name) { + SPDK_ERRLOG("crypto configuration missing bdev name\n"); + return -EINVAL; + } + + conf_vbdev_name = spdk_conf_section_get_nmval(sp, "CRY", i, 1); + if (!conf_vbdev_name) { + SPDK_ERRLOG("crypto configuration missing crypto_bdev name\n"); + return -EINVAL; + } + + key = spdk_conf_section_get_nmval(sp, "CRY", i, 2); + if (!key) { + SPDK_ERRLOG("crypto configuration missing crypto_bdev key\n"); + return -EINVAL; + } + SPDK_NOTICELOG("WARNING: You are storing your key in a plain text file!!\n"); + + crypto_pmd = spdk_conf_section_get_nmval(sp, "CRY", i, 3); + if (!crypto_pmd) { + SPDK_ERRLOG("crypto configuration missing driver type\n"); + return -EINVAL; + } + + rc = vbdev_crypto_insert_name(conf_bdev_name, conf_vbdev_name, + crypto_pmd, key); + if (rc != 0) { + return rc; + } + } + + return rc; +} + +/* Called when the entire module is being torn down. */ +static void +vbdev_crypto_finish(void) +{ + struct bdev_names *name; + struct vbdev_dev *device; + struct device_qp *dev_qp; + + while ((name = TAILQ_FIRST(&g_bdev_names))) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->drv_name); + free(name->key); + free(name->bdev_name); + free(name->vbdev_name); + free(name); + } + + while ((device = TAILQ_FIRST(&g_vbdev_devs))) { + TAILQ_REMOVE(&g_vbdev_devs, device, link); + rte_cryptodev_stop(device->cdev_id); + free(device); + } + + while ((dev_qp = TAILQ_FIRST(&g_device_qp))) { + TAILQ_REMOVE(&g_device_qp, dev_qp, link); + free(dev_qp); + } + + rte_mempool_free(g_crypto_op_mp); + spdk_mempool_free(g_mbuf_mp); + spdk_mempool_free(g_session_mp); +} + +/* During init we'll be asked how much memory we'd like passed to us + * in bev_io structures as context. Here's where we specify how + * much context we want per IO. + */ +static int +vbdev_crypto_get_ctx_size(void) +{ + return sizeof(struct crypto_bdev_io); +} + +/* Called when SPDK wants to save the current config of this vbdev module to + * a file. + */ +static void +vbdev_crypto_get_spdk_running_config(FILE *fp) +{ + struct bdev_names *names = NULL; + fprintf(fp, "\n[crypto]\n"); + TAILQ_FOREACH(names, &g_bdev_names, link) { + fprintf(fp, " crypto %s %s ", names->bdev_name, names->vbdev_name); + fprintf(fp, "\n"); + } + + fprintf(fp, "\n"); +} + +/* Called when the underlying base bdev goes away. */ +static void +vbdev_crypto_examine_hotremove_cb(void *ctx) +{ + struct vbdev_crypto *crypto_bdev, *tmp; + struct spdk_bdev *bdev_find = ctx; + + TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) { + if (bdev_find == crypto_bdev->base_bdev) { + spdk_bdev_unregister(&crypto_bdev->crypto_bdev, NULL, NULL); + } + } +} + +static void +vbdev_crypto_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +/* When we register our bdev this is how we specify our entry points. 
*/ +static const struct spdk_bdev_fn_table vbdev_crypto_fn_table = { + .destruct = vbdev_crypto_destruct, + .submit_request = vbdev_crypto_submit_request, + .io_type_supported = vbdev_crypto_io_type_supported, + .get_io_channel = vbdev_crypto_get_io_channel, + .dump_info_json = vbdev_crypto_dump_info_json, + .write_config_json = vbdev_crypto_write_config_json +}; + +static struct spdk_bdev_module crypto_if = { + .name = "crypto", + .module_init = vbdev_crypto_init, + .config_text = vbdev_crypto_get_spdk_running_config, + .get_ctx_size = vbdev_crypto_get_ctx_size, + .examine_config = vbdev_crypto_examine, + .module_fini = vbdev_crypto_finish, + .config_json = vbdev_crypto_config_json +}; + +SPDK_BDEV_MODULE_REGISTER(&crypto_if) + +static int +vbdev_crypto_claim(struct spdk_bdev *bdev) +{ + struct bdev_names *name; + struct vbdev_crypto *vbdev; + int rc = 0; + + /* Check our list of names from config versus this bdev and if + * there's a match, create the crypto_bdev & bdev accordingly. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->bdev_name, bdev->name) != 0) { + continue; + } + + SPDK_NOTICELOG("Match on %s\n", bdev->name); + vbdev = calloc(1, sizeof(struct vbdev_crypto)); + if (!vbdev) { + SPDK_ERRLOG("could not allocate crypto_bdev\n"); + rc = -ENOMEM; + goto error_vbdev_alloc; + } + + /* The base bdev that we're attaching to. */ + vbdev->base_bdev = bdev; + vbdev->crypto_bdev.name = strdup(name->vbdev_name); + if (!vbdev->crypto_bdev.name) { + SPDK_ERRLOG("could not allocate crypto_bdev name\n"); + rc = -ENOMEM; + goto error_bdev_name; + } + + vbdev->key = strdup(name->key); + if (!vbdev->key) { + SPDK_ERRLOG("could not allocate crypto_bdev key\n"); + rc = -ENOMEM; + goto error_alloc_key; + } + + vbdev->drv_name = strdup(name->drv_name); + if (!vbdev->drv_name) { + SPDK_ERRLOG("could not allocate crypto_bdev drv_name\n"); + rc = -ENOMEM; + goto error_drv_name; + } + + vbdev->crypto_bdev.product_name = "crypto"; + vbdev->crypto_bdev.write_cache = bdev->write_cache; + vbdev->crypto_bdev.need_aligned_buffer = bdev->need_aligned_buffer; + /* Note: CRYPTO_MAX_IO is in units of bytes, optimal_io_boundary is + * in units of blocks. + */ + if (bdev->optimal_io_boundary > 0) { + vbdev->crypto_bdev.optimal_io_boundary = + spdk_min((CRYPTO_MAX_IO / bdev->blocklen), bdev->optimal_io_boundary); + } else { + vbdev->crypto_bdev.optimal_io_boundary = (CRYPTO_MAX_IO / bdev->blocklen); + } + vbdev->crypto_bdev.split_on_optimal_io_boundary = true; + vbdev->crypto_bdev.blocklen = bdev->blocklen; + vbdev->crypto_bdev.blockcnt = bdev->blockcnt; + + /* This is the context that is passed to us when the bdev + * layer calls in so we'll save our crypto_bdev node here. 
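A short aside on the optimal_io_boundary selection a few lines above: it caps any single I/O at CRYPTO_MAX_IO bytes, expressed in blocks, while still honoring a smaller boundary advertised by the base bdev. A worked sketch of that choice (the helper name is illustrative; with 512-byte blocks the crypto boundary works out to 64 KiB / 512 = 128 blocks):

#include <stdint.h>

#define EXAMPLE_CRYPTO_MAX_IO (64 * 1024)

/* Sketch only: pick the stricter of the crypto module's byte limit (converted
 * to blocks) and the base bdev's own optimal boundary, if it has one.
 */
static uint32_t
example_optimal_io_boundary(uint32_t base_boundary, uint32_t blocklen)
{
	uint32_t crypto_boundary = EXAMPLE_CRYPTO_MAX_IO / blocklen;

	if (base_boundary > 0 && base_boundary < crypto_boundary) {
		return base_boundary;
	}
	return crypto_boundary;
}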
+ */ + vbdev->crypto_bdev.ctxt = vbdev; + vbdev->crypto_bdev.fn_table = &vbdev_crypto_fn_table; + vbdev->crypto_bdev.module = &crypto_if; + TAILQ_INSERT_TAIL(&g_vbdev_crypto, vbdev, link); + + spdk_io_device_register(vbdev, crypto_bdev_ch_create_cb, crypto_bdev_ch_destroy_cb, + sizeof(struct crypto_io_channel), vbdev->crypto_bdev.name); + + rc = spdk_bdev_open(bdev, true, vbdev_crypto_examine_hotremove_cb, + bdev, &vbdev->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_open; + } + + rc = spdk_bdev_module_claim_bdev(bdev, vbdev->base_desc, vbdev->crypto_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev)); + goto error_claim; + } + + SPDK_NOTICELOG("registered crypto_bdev for: %s\n", name->vbdev_name); + } + + return rc; + + /* Error cleanup paths. */ +error_claim: + spdk_bdev_close(vbdev->base_desc); +error_open: + TAILQ_REMOVE(&g_vbdev_crypto, vbdev, link); + spdk_io_device_unregister(vbdev, NULL); + free(vbdev->drv_name); +error_drv_name: + free(vbdev->key); +error_alloc_key: + free(vbdev->crypto_bdev.name); +error_bdev_name: + free(vbdev); +error_vbdev_alloc: + return rc; +} + +/* RPC entry for deleting a crypto vbdev. */ +void +delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn, + void *cb_arg) +{ + struct bdev_names *name; + + if (!bdev || bdev->module != &crypto_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + /* Remove the association (vbdev, bdev) from g_bdev_names. This is required so that the + * vbdev does not get re-created if the same bdev is constructed at some other time, + * unless the underlying bdev was hot-removed. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->vbdev_name, bdev->name) == 0) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name->drv_name); + free(name->key); + free(name); + break; + } + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +/* Because we specified this function in our crypto bdev function table when we + * registered our crypto bdev, we'll get this call anytime a new bdev shows up. + * Here we need to decide if we care about it and if so what to do. We + * parsed the config file at init so we check the new bdev against the list + * we built up at that time and if the user configured us to attach to this + * bdev, here's where we do it. + */ +static void +vbdev_crypto_examine(struct spdk_bdev *bdev) +{ + struct vbdev_crypto *crypto_bdev, *tmp; + int rc; + + vbdev_crypto_claim(bdev); + + TAILQ_FOREACH_SAFE(crypto_bdev, &g_vbdev_crypto, link, tmp) { + if (strcmp(crypto_bdev->base_bdev->name, bdev->name) == 0) { + rc = spdk_vbdev_register(&crypto_bdev->crypto_bdev, + &crypto_bdev->base_bdev, 1); + if (rc) { + SPDK_ERRLOG("could not register crypto_bdev\n"); + spdk_bdev_close(crypto_bdev->base_desc); + TAILQ_REMOVE(&g_vbdev_crypto, crypto_bdev, link); + free(crypto_bdev->crypto_bdev.name); + free(crypto_bdev->key); + free(crypto_bdev); + } + break; + } + } + + spdk_bdev_module_examine_done(&crypto_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_crypto", SPDK_LOG_VBDEV_crypto) diff --git a/src/spdk/lib/bdev/crypto/vbdev_crypto.h b/src/spdk/lib/bdev/crypto/vbdev_crypto.h new file mode 100644 index 00000000..c8ef8d16 --- /dev/null +++ b/src/spdk/lib/bdev/crypto/vbdev_crypto.h @@ -0,0 +1,66 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_CRYPTO_H +#define SPDK_VBDEV_CRYPTO_H + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_crypto_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new crypto bdev. + * + * \param bdev_name Bdev on which crypto vbdev will be created. + * \param bdev_name Vbdev name crypto_pmd key + * \return 0 on success, other on failure. + */ +int create_crypto_disk(const char *bdev_name, const char *vbdev_name, + const char *crypto_pmd, const char *key); + +/** + * Delete crypto bdev. + * + * \param bdev Pointer to crypto bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void delete_crypto_disk(struct spdk_bdev *bdev, spdk_delete_crypto_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_VBDEV_CRYPTO_H */ diff --git a/src/spdk/lib/bdev/crypto/vbdev_crypto_rpc.c b/src/spdk/lib/bdev/crypto/vbdev_crypto_rpc.c new file mode 100644 index 00000000..cbf5a3b8 --- /dev/null +++ b/src/spdk/lib/bdev/crypto/vbdev_crypto_rpc.c @@ -0,0 +1,163 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_crypto.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_construct_crypto { + char *base_bdev_name; + char *name; + char *crypto_pmd; + char *key; +}; + +/* Free the allocated memory resource after the RPC handling. */ +static void +free_rpc_construct_crypto(struct rpc_construct_crypto *r) +{ + free(r->base_bdev_name); + free(r->name); + free(r->crypto_pmd); + free(r->key); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_construct_crypto_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_construct_crypto, base_bdev_name), spdk_json_decode_string}, + {"name", offsetof(struct rpc_construct_crypto, name), spdk_json_decode_string}, + {"crypto_pmd", offsetof(struct rpc_construct_crypto, crypto_pmd), spdk_json_decode_string}, + {"key", offsetof(struct rpc_construct_crypto, key), spdk_json_decode_string}, +}; + +/* Decode the parameters for this RPC method and properly construct the crypto + * device. Error status returned in the failed cases. 
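The construct_crypto_bdev handler below decodes exactly the four string parameters declared in the decoder table above and, on success, replies with the new vbdev's name as a JSON string. A hypothetical JSON-RPC request (all parameter values are placeholders) would therefore look like this:

/*
 * Hypothetical request for the "construct_crypto_bdev" method:
 *
 *   {
 *     "jsonrpc": "2.0",
 *     "id": 1,
 *     "method": "construct_crypto_bdev",
 *     "params": {
 *       "base_bdev_name": "Malloc0",
 *       "name": "crypto_malloc0",
 *       "crypto_pmd": "crypto_aesni_mb",
 *       "key": "0123456789123456"
 *     }
 *   }
 */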
+ */ +static void +spdk_rpc_construct_crypto_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_crypto req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_crypto_decoders, + SPDK_COUNTOF(rpc_construct_crypto_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_crypto, "spdk_json_decode_object failed\n"); + goto invalid; + } + + rc = create_crypto_disk(req.base_bdev_name, req.name, + req.crypto_pmd, req.key); + if (rc != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_construct_crypto(&req); + return; + } + + spdk_json_write_string(w, req.name); + spdk_jsonrpc_end_result(request, w); + free_rpc_construct_crypto(&req); + return; + +invalid: + free_rpc_construct_crypto(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("construct_crypto_bdev", spdk_rpc_construct_crypto_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_crypto { + char *name; +}; + +static void +free_rpc_delete_crypto(struct rpc_delete_crypto *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_crypto_decoders[] = { + {"name", offsetof(struct rpc_delete_crypto, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_crypto_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_crypto_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_crypto req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_crypto_decoders, + SPDK_COUNTOF(rpc_delete_crypto_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + delete_crypto_disk(bdev, _spdk_rpc_delete_crypto_bdev_cb, request); + + free_rpc_delete_crypto(&req); + + return; + +invalid: + free_rpc_delete_crypto(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_crypto_bdev", spdk_rpc_delete_crypto_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/error/Makefile b/src/spdk/lib/bdev/error/Makefile new file mode 100644 index 00000000..9dcee8bd --- /dev/null +++ b/src/spdk/lib/bdev/error/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+C_SRCS = vbdev_error.c vbdev_error_rpc.c
+LIBNAME = vbdev_error
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/bdev/error/vbdev_error.c b/src/spdk/lib/bdev/error/vbdev_error.c
new file mode 100644
index 00000000..4bab9426
--- /dev/null
+++ b/src/spdk/lib/bdev/error/vbdev_error.c
@@ -0,0 +1,513 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This is a module for test purposes which will simulate error cases for a bdev.
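+ *
+ * Sketch of typical usage (the bdev name "Malloc0" is illustrative, not part
+ * of this code): the base bdev can be listed in the configuration file as
+ *
+ *   [BdevError]
+ *   BdevError Malloc0
+ *
+ * or wrapped at runtime with the "construct_error_bdev" RPC. The resulting
+ * vbdev is named "EE_<base bdev name>", and errors are then armed with the
+ * "bdev_inject_error" RPC (io_type "read"/"write"/"flush"/"unmap"/"all"/"clear",
+ * error_type "failure" or "pending", plus an error count).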
+ */ + +#include "spdk/stdinc.h" +#include "spdk/rpc.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/endian.h" +#include "spdk/nvme_spec.h" +#include "spdk/string.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "vbdev_error.h" + +struct spdk_vbdev_error_config { + char *base_bdev; + TAILQ_ENTRY(spdk_vbdev_error_config) tailq; +}; + +static TAILQ_HEAD(, spdk_vbdev_error_config) g_error_config + = TAILQ_HEAD_INITIALIZER(g_error_config); + +struct vbdev_error_info { + bool enabled; + uint32_t error_type; + uint32_t error_num; +}; + +/* Context for each error bdev */ +struct error_disk { + struct spdk_bdev_part part; + struct vbdev_error_info error_vector[SPDK_BDEV_IO_TYPE_RESET]; + TAILQ_HEAD(, spdk_bdev_io) pending_ios; +}; + +struct error_channel { + struct spdk_bdev_part_channel part_ch; +}; + +static pthread_mutex_t g_vbdev_error_mutex = PTHREAD_MUTEX_INITIALIZER; +static SPDK_BDEV_PART_TAILQ g_error_disks = TAILQ_HEAD_INITIALIZER(g_error_disks); + +static int vbdev_error_init(void); +static void vbdev_error_fini(void); + +static void vbdev_error_examine(struct spdk_bdev *bdev); +static int vbdev_error_config_json(struct spdk_json_write_ctx *w); + +static int vbdev_error_config_add(const char *base_bdev_name); +static int vbdev_error_config_remove(const char *base_bdev_name); + +static struct spdk_bdev_module error_if = { + .name = "error", + .module_init = vbdev_error_init, + .module_fini = vbdev_error_fini, + .examine_config = vbdev_error_examine, + .config_json = vbdev_error_config_json, + +}; + +SPDK_BDEV_MODULE_REGISTER(&error_if) + +int +spdk_vbdev_inject_error(char *name, uint32_t io_type, uint32_t error_type, uint32_t error_num) +{ + struct spdk_bdev *bdev; + struct spdk_bdev_part *part; + struct error_disk *error_disk = NULL; + uint32_t i; + + pthread_mutex_lock(&g_vbdev_error_mutex); + bdev = spdk_bdev_get_by_name(name); + if (!bdev) { + SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name); + pthread_mutex_unlock(&g_vbdev_error_mutex); + return -1; + } + + TAILQ_FOREACH(part, &g_error_disks, tailq) { + if (bdev == spdk_bdev_part_get_bdev(part)) { + error_disk = (struct error_disk *)part; + break; + } + } + + if (error_disk == NULL) { + SPDK_ERRLOG("Could not find ErrorInjection bdev %s\n", name); + pthread_mutex_unlock(&g_vbdev_error_mutex); + return -1; + } + + if (0xffffffff == io_type) { + for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) { + error_disk->error_vector[i].enabled = true; + error_disk->error_vector[i].error_type = error_type; + error_disk->error_vector[i].error_num = error_num; + } + } else if (0 == io_type) { + for (i = 0; i < SPDK_COUNTOF(error_disk->error_vector); i++) { + error_disk->error_vector[i].enabled = false; + error_disk->error_vector[i].error_num = 0; + } + } else { + error_disk->error_vector[io_type].enabled = true; + error_disk->error_vector[io_type].error_type = error_type; + error_disk->error_vector[io_type].error_num = error_num; + } + pthread_mutex_unlock(&g_vbdev_error_mutex); + return 0; +} + +static void +vbdev_error_reset(struct error_disk *error_disk, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_io *pending_io, *tmp; + + TAILQ_FOREACH_SAFE(pending_io, &error_disk->pending_ios, module_link, tmp) { + TAILQ_REMOVE(&error_disk->pending_ios, pending_io, module_link); + spdk_bdev_io_complete(pending_io, SPDK_BDEV_IO_STATUS_FAILED); + } + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static uint32_t +vbdev_error_get_error_type(struct error_disk 
*error_disk, uint32_t io_type) +{ + if (error_disk->error_vector[io_type].enabled && + error_disk->error_vector[io_type].error_num) { + return error_disk->error_vector[io_type].error_type; + } + return 0; +} + +static void +vbdev_error_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct error_channel *ch = spdk_io_channel_get_ctx(_ch); + struct error_disk *error_disk = bdev_io->bdev->ctxt; + uint32_t error_type; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_FLUSH: + break; + case SPDK_BDEV_IO_TYPE_RESET: + vbdev_error_reset(error_disk, bdev_io); + return; + default: + SPDK_ERRLOG("Error Injection: unknown I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + error_type = vbdev_error_get_error_type(error_disk, bdev_io->type); + if (error_type == 0) { + int rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + + if (rc) { + SPDK_ERRLOG("bdev_error: submit request failed, rc=%d\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + return; + } else if (error_type == VBDEV_IO_FAILURE) { + error_disk->error_vector[bdev_io->type].error_num--; + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else if (error_type == VBDEV_IO_PENDING) { + TAILQ_INSERT_TAIL(&error_disk->pending_ios, bdev_io, module_link); + error_disk->error_vector[bdev_io->type].error_num--; + } +} + +static int +vbdev_error_destruct(void *ctx) +{ + struct error_disk *error_disk = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part); + int rc; + + rc = vbdev_error_config_remove(base_bdev->name); + if (rc != 0) { + SPDK_ERRLOG("vbdev_error_config_remove() failed\n"); + } + + return spdk_bdev_part_free(&error_disk->part); +} + +static int +vbdev_error_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct error_disk *error_disk = ctx; + struct spdk_bdev *base_bdev = spdk_bdev_part_get_base_bdev(&error_disk->part); + + spdk_json_write_name(w, "error_disk"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "base_bdev"); + spdk_json_write_string(w, base_bdev->name); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +vbdev_error_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev. 
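+ * Per-bdev JSON config is not emitted here because the module-level
+ * vbdev_error_config_json() below already writes one "construct_error_bdev"
+ * entry for each configured base bdev.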
*/ +} + + +static struct spdk_bdev_fn_table vbdev_error_fn_table = { + .destruct = vbdev_error_destruct, + .submit_request = vbdev_error_submit_request, + .dump_info_json = vbdev_error_dump_info_json, + .write_config_json = vbdev_error_write_config_json +}; + +static void +spdk_vbdev_error_base_bdev_hotremove_cb(void *_base_bdev) +{ + spdk_bdev_part_base_hotremove(_base_bdev, &g_error_disks); +} + +static int +_spdk_vbdev_error_create(struct spdk_bdev *base_bdev) +{ + struct spdk_bdev_part_base *base = NULL; + struct error_disk *disk = NULL; + char *name; + int rc; + + base = spdk_bdev_part_base_construct(base_bdev, + spdk_vbdev_error_base_bdev_hotremove_cb, + &error_if, &vbdev_error_fn_table, &g_error_disks, + NULL, NULL, sizeof(struct error_channel), + NULL, NULL); + if (!base) { + SPDK_ERRLOG("could not construct part base for bdev %s\n", spdk_bdev_get_name(base_bdev)); + return -ENOMEM; + } + + disk = calloc(1, sizeof(*disk)); + if (!disk) { + SPDK_ERRLOG("Memory allocation failure\n"); + spdk_bdev_part_base_free(base); + return -ENOMEM; + } + + name = spdk_sprintf_alloc("EE_%s", spdk_bdev_get_name(base_bdev)); + if (!name) { + SPDK_ERRLOG("name allocation failure\n"); + spdk_bdev_part_base_free(base); + free(disk); + return -ENOMEM; + } + + rc = spdk_bdev_part_construct(&disk->part, base, name, 0, base_bdev->blockcnt, + "Error Injection Disk"); + free(name); + if (rc) { + SPDK_ERRLOG("could not construct part for bdev %s\n", spdk_bdev_get_name(base_bdev)); + /* spdk_bdev_part_construct will free name on failure */ + spdk_bdev_part_base_free(base); + free(disk); + return rc; + } + + TAILQ_INIT(&disk->pending_ios); + + return 0; +} + +int +spdk_vbdev_error_create(const char *base_bdev_name) +{ + int rc; + struct spdk_bdev *base_bdev; + + rc = vbdev_error_config_add(base_bdev_name); + if (rc != 0) { + SPDK_ERRLOG("Adding config for ErrorInjection bdev %s failed (rc=%d)\n", + base_bdev_name, rc); + return rc; + } + + base_bdev = spdk_bdev_get_by_name(base_bdev_name); + if (!base_bdev) { + return 0; + } + + rc = _spdk_vbdev_error_create(base_bdev); + if (rc != 0) { + vbdev_error_config_remove(base_bdev_name); + SPDK_ERRLOG("Could not create ErrorInjection bdev %s (rc=%d)\n", + base_bdev_name, rc); + } + + return rc; +} + +void +spdk_vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn, void *cb_arg) +{ + if (!vbdev || vbdev->module != &error_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(vbdev, cb_fn, cb_arg); +} + +static void +vbdev_error_clear_config(void) +{ + struct spdk_vbdev_error_config *cfg; + + while ((cfg = TAILQ_FIRST(&g_error_config))) { + TAILQ_REMOVE(&g_error_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); + } +} + +static struct spdk_vbdev_error_config * +vbdev_error_config_find_by_base_name(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + TAILQ_FOREACH(cfg, &g_error_config, tailq) { + if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { + return cfg; + } + } + + return NULL; +} + +static int +vbdev_error_config_add(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + cfg = vbdev_error_config_find_by_base_name(base_bdev_name); + if (cfg) { + SPDK_ERRLOG("vbdev_error_config for bdev %s already exists\n", + base_bdev_name); + return -EEXIST; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc() failed for vbdev_error_config\n"); + return -ENOMEM; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + free(cfg); + 
SPDK_ERRLOG("strdup() failed for base_bdev_name\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq); + + return 0; +} + +static int +vbdev_error_config_remove(const char *base_bdev_name) +{ + struct spdk_vbdev_error_config *cfg; + + cfg = vbdev_error_config_find_by_base_name(base_bdev_name); + if (!cfg) { + return -ENOENT; + } + + TAILQ_REMOVE(&g_error_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); + return 0; +} + +static int +vbdev_error_init(void) +{ + struct spdk_conf_section *sp; + struct spdk_vbdev_error_config *cfg; + const char *base_bdev_name; + int i, rc; + + sp = spdk_conf_find_section(NULL, "BdevError"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "BdevError", i)) { + break; + } + + base_bdev_name = spdk_conf_section_get_nmval(sp, "BdevError", i, 0); + if (!base_bdev_name) { + SPDK_ERRLOG("ErrorInjection configuration missing bdev name\n"); + rc = -EINVAL; + goto error; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc() failed for vbdev_error_config\n"); + rc = -ENOMEM; + goto error; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + free(cfg); + SPDK_ERRLOG("strdup() failed for bdev name\n"); + rc = -ENOMEM; + goto error; + } + + TAILQ_INSERT_TAIL(&g_error_config, cfg, tailq); + } + + return 0; + +error: + vbdev_error_clear_config(); + return rc; +} + +static void +vbdev_error_fini(void) +{ + vbdev_error_clear_config(); +} + +static void +vbdev_error_examine(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_error_config *cfg; + int rc; + + cfg = vbdev_error_config_find_by_base_name(bdev->name); + if (cfg != NULL) { + rc = _spdk_vbdev_error_create(bdev); + if (rc != 0) { + SPDK_ERRLOG("could not create error vbdev for bdev %s at examine\n", + bdev->name); + } + } + + spdk_bdev_module_examine_done(&error_if); +} + +static int +vbdev_error_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vbdev_error_config *cfg; + + TAILQ_FOREACH(cfg, &g_error_config, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_error_bdev"); + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_name", cfg->base_bdev); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + return 0; +} diff --git a/src/spdk/lib/bdev/error/vbdev_error.h b/src/spdk/lib/bdev/error/vbdev_error.h new file mode 100644 index 00000000..4ff1ac19 --- /dev/null +++ b/src/spdk/lib/bdev/error/vbdev_error.h @@ -0,0 +1,76 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_ERROR_H
+#define SPDK_VBDEV_ERROR_H
+
+#include "spdk/stdinc.h"
+#include "spdk/bdev.h"
+
+enum vbdev_error_type {
+ VBDEV_IO_FAILURE = 1,
+ VBDEV_IO_PENDING,
+};
+
+typedef void (*spdk_delete_error_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * Create a vbdev on top of the base bdev to inject errors into it.
+ *
+ * \param base_bdev_name Name of the base bdev.
+ * \return 0 on success or negative on failure.
+ */
+int spdk_vbdev_error_create(const char *base_bdev_name);
+
+/**
+ * Delete the vbdev used to inject errors.
+ *
+ * \param vbdev Pointer to the error vbdev.
+ * \param cb_fn Function to call after deletion.
+ * \param cb_arg Arguments to pass to cb_fn.
+ */
+void spdk_vbdev_error_delete(struct spdk_bdev *vbdev, spdk_delete_error_complete cb_fn,
+ void *cb_arg);
+
+/**
+ * Inject errors into the ErrorInjection bdev. Users can specify which I/O type
+ * the error is injected into, what type of error is injected, and how many
+ * errors are injected.
+ *
+ * \param name Name of the ErrorInjection bdev to inject errors into.
+ * \param io_type I/O type to inject errors into.
+ * \param error_type Type of error to inject.
+ * \param error_num Number of errors to inject.
+ * \return 0 on success or negative on failure.
+ */
+int spdk_vbdev_inject_error(char *name, uint32_t io_type, uint32_t error_type,
+ uint32_t error_num);
+
+#endif // SPDK_VBDEV_ERROR_H
diff --git a/src/spdk/lib/bdev/error/vbdev_error_rpc.c b/src/spdk/lib/bdev/error/vbdev_error_rpc.c
new file mode 100644
index 00000000..8d95fd09
--- /dev/null
+++ b/src/spdk/lib/bdev/error/vbdev_error_rpc.c
@@ -0,0 +1,258 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "vbdev_error.h" + +#define ERROR_BDEV_IO_TYPE_INVALID (SPDK_BDEV_IO_TYPE_RESET + 1) +#define ERROR_BDEV_ERROR_TYPE_INVALID (VBDEV_IO_PENDING + 1) + +static uint32_t +spdk_rpc_error_bdev_io_type_parse(char *name) +{ + if (strcmp(name, "read") == 0) { + return SPDK_BDEV_IO_TYPE_READ; + } else if (strcmp(name, "write") == 0) { + return SPDK_BDEV_IO_TYPE_WRITE; + } else if (strcmp(name, "flush") == 0) { + return SPDK_BDEV_IO_TYPE_FLUSH; + } else if (strcmp(name, "unmap") == 0) { + return SPDK_BDEV_IO_TYPE_UNMAP; + } else if (strcmp(name, "all") == 0) { + return 0xffffffff; + } else if (strcmp(name, "clear") == 0) { + return 0; + } + return ERROR_BDEV_IO_TYPE_INVALID; +} + +static uint32_t +spdk_rpc_error_bdev_error_type_parse(char *name) +{ + if (strcmp(name, "failure") == 0) { + return VBDEV_IO_FAILURE; + } else if (strcmp(name, "pending") == 0) { + return VBDEV_IO_PENDING; + } + return ERROR_BDEV_ERROR_TYPE_INVALID; +} + +struct rpc_construct_error_bdev { + char *base_name; +}; + +static void +free_rpc_construct_error_bdev(struct rpc_construct_error_bdev *req) +{ + free(req->base_name); +} + +static const struct spdk_json_object_decoder rpc_construct_error_bdev_decoders[] = { + {"base_name", offsetof(struct rpc_construct_error_bdev, base_name), spdk_json_decode_string}, +}; + +static void +spdk_rpc_construct_error_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_error_bdev req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_construct_error_bdev_decoders, + SPDK_COUNTOF(rpc_construct_error_bdev_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (spdk_vbdev_error_create(req.base_name)) { + SPDK_ERRLOG("Could not create ErrorInjection bdev %s\n", req.base_name); + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_construct_error_bdev(&req); + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + + free_rpc_construct_error_bdev(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_construct_error_bdev(&req); +} +SPDK_RPC_REGISTER("construct_error_bdev", spdk_rpc_construct_error_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_error { + char *name; +}; + +static void +free_rpc_delete_error(struct rpc_delete_error *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_error_decoders[] = { + {"name", offsetof(struct rpc_delete_error, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_error_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = 
spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_error_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_error req = {NULL}; + struct spdk_bdev *vbdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_error_decoders, + SPDK_COUNTOF(rpc_delete_error_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + vbdev = spdk_bdev_get_by_name(req.name); + if (vbdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + spdk_vbdev_error_delete(vbdev, _spdk_rpc_delete_error_bdev_cb, request); + + free_rpc_delete_error(&req); + + return; + +invalid: + free_rpc_delete_error(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_error_bdev", spdk_rpc_delete_error_bdev, SPDK_RPC_RUNTIME) + +struct rpc_error_information { + char *name; + char *io_type; + char *error_type; + uint32_t num; +}; + +static const struct spdk_json_object_decoder rpc_error_information_decoders[] = { + {"name", offsetof(struct rpc_error_information, name), spdk_json_decode_string}, + {"io_type", offsetof(struct rpc_error_information, io_type), spdk_json_decode_string}, + {"error_type", offsetof(struct rpc_error_information, error_type), spdk_json_decode_string}, + {"num", offsetof(struct rpc_error_information, num), spdk_json_decode_uint32, true}, +}; + +static void +free_rpc_error_information(struct rpc_error_information *p) +{ + free(p->name); + free(p->io_type); + free(p->error_type); +} + +static void +spdk_rpc_bdev_inject_error(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_error_information req = {}; + struct spdk_json_write_ctx *w; + uint32_t io_type; + uint32_t error_type; + int ret; + + if (spdk_json_decode_object(params, rpc_error_information_decoders, + SPDK_COUNTOF(rpc_error_information_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + io_type = spdk_rpc_error_bdev_io_type_parse(req.io_type); + if (io_type == ERROR_BDEV_IO_TYPE_INVALID) { + goto invalid; + } + + error_type = spdk_rpc_error_bdev_error_type_parse(req.error_type); + if (error_type == ERROR_BDEV_ERROR_TYPE_INVALID) { + goto invalid; + } + + ret = spdk_vbdev_inject_error(req.name, io_type, error_type, req.num); + if (ret) { + goto invalid; + } + + free_rpc_error_information(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_error_information(&req); +} +SPDK_RPC_REGISTER("bdev_inject_error", spdk_rpc_bdev_inject_error, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/gpt/Makefile b/src/spdk/lib/bdev/gpt/Makefile new file mode 100644 index 00000000..6806c647 --- /dev/null +++ b/src/spdk/lib/bdev/gpt/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = gpt.c vbdev_gpt.c +LIBNAME = vbdev_gpt + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/gpt/gpt.c b/src/spdk/lib/bdev/gpt/gpt.c new file mode 100644 index 00000000..0e830cdd --- /dev/null +++ b/src/spdk/lib/bdev/gpt/gpt.c @@ -0,0 +1,239 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "gpt.h"
+
+#include "spdk/crc32.h"
+#include "spdk/endian.h"
+#include "spdk/event.h"
+
+#include "spdk_internal/log.h"
+
+#define GPT_PRIMARY_PARTITION_TABLE_LBA 0x1
+#define PRIMARY_PARTITION_NUMBER 4
+#define GPT_PROTECTIVE_MBR 1
+#define SPDK_MAX_NUM_PARTITION_ENTRIES 128
+
+static int
+spdk_gpt_read_partitions(struct spdk_gpt *gpt)
+{
+ uint32_t total_partition_size, num_partition_entries, partition_entry_size;
+ uint64_t partition_start_lba;
+ struct spdk_gpt_header *head = gpt->header;
+ uint32_t crc32;
+
+ num_partition_entries = from_le32(&head->num_partition_entries);
+ if (num_partition_entries > SPDK_MAX_NUM_PARTITION_ENTRIES) {
+ SPDK_ERRLOG("Num_partition_entries=%u which exceeds max=%u\n",
+ num_partition_entries, SPDK_MAX_NUM_PARTITION_ENTRIES);
+ return -1;
+ }
+
+ partition_entry_size = from_le32(&head->size_of_partition_entry);
+ if (partition_entry_size != sizeof(struct spdk_gpt_partition_entry)) {
+ SPDK_ERRLOG("Partition_entry_size(%x) != expected(%lx)\n",
+ partition_entry_size, sizeof(struct spdk_gpt_partition_entry));
+ return -1;
+ }
+
+ total_partition_size = num_partition_entries * partition_entry_size;
+ partition_start_lba = from_le64(&head->partition_entry_lba);
+ if ((total_partition_size + partition_start_lba * gpt->sector_size) > SPDK_GPT_BUFFER_SIZE) {
+ SPDK_ERRLOG("Buffer size is not enough\n");
+ return -1;
+ }
+
+ gpt->partitions = (struct spdk_gpt_partition_entry *)(gpt->buf +
+ partition_start_lba * gpt->sector_size);
+
+ crc32 = spdk_crc32_ieee_update(gpt->partitions, total_partition_size, ~0);
+ crc32 ^= ~0;
+
+ if (crc32 != from_le32(&head->partition_entry_array_crc32)) {
+ SPDK_ERRLOG("GPT partition entry array crc32 did not match\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+spdk_gpt_lba_range_check(struct spdk_gpt_header *head, uint64_t lba_end)
+{
+ uint64_t usable_lba_start, usable_lba_end;
+
+ usable_lba_start = from_le64(&head->first_usable_lba);
+ usable_lba_end = from_le64(&head->last_usable_lba);
+
+ if (usable_lba_end < usable_lba_start) {
+ SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") < usable_lba_start(%" PRIu64 ")\n",
+ usable_lba_end, usable_lba_start);
+ return -1;
+ }
+
+ if (usable_lba_end > lba_end) {
+ SPDK_ERRLOG("Head's usable_lba_end(%" PRIu64 ") > lba_end(%" PRIu64 ")\n",
+ usable_lba_end, lba_end);
+ return -1;
+ }
+
+ if ((usable_lba_start < GPT_PRIMARY_PARTITION_TABLE_LBA) &&
+ (GPT_PRIMARY_PARTITION_TABLE_LBA < usable_lba_end)) {
+ SPDK_ERRLOG("Head lba is not in the usable range\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+spdk_gpt_read_header(struct spdk_gpt *gpt)
+{
+ uint32_t head_size;
+ uint32_t new_crc, original_crc;
+ struct spdk_gpt_header *head;
+
+ head = (struct spdk_gpt_header *)(gpt->buf + GPT_PRIMARY_PARTITION_TABLE_LBA * gpt->sector_size);
+ head_size = from_le32(&head->header_size);
+ if (head_size < sizeof(*head) || head_size > gpt->sector_size) {
+ SPDK_ERRLOG("head_size=%u\n", head_size);
+ return -1;
+ }
+
+ original_crc = from_le32(&head->header_crc32);
+ head->header_crc32 = 0;
+ new_crc = spdk_crc32_ieee_update(head, from_le32(&head->header_size), ~0);
+ new_crc ^= ~0;
+ /* restore header crc32 */
+ to_le32(&head->header_crc32, original_crc);
+
+ if (new_crc != original_crc) {
+ SPDK_ERRLOG("head crc32 does not match, provided=%u, calculated=%u\n",
+ original_crc, new_crc);
+ return -1;
+ }
+
+ if (memcmp(SPDK_GPT_SIGNATURE, head->gpt_signature,
+ sizeof(head->gpt_signature))) {
+ SPDK_ERRLOG("signature did not match\n");
+ return -1;
+ }
+
+ if
(spdk_gpt_lba_range_check(head, gpt->lba_end)) { + SPDK_ERRLOG("lba range check error\n"); + return -1; + } + + gpt->header = head; + return 0; +} + +static int +spdk_gpt_check_mbr(struct spdk_gpt *gpt) +{ + int i, primary_partition = 0; + uint32_t total_lba_size = 0, ret = 0, expected_start_lba; + struct spdk_mbr *mbr; + + mbr = (struct spdk_mbr *)gpt->buf; + if (from_le16(&mbr->mbr_signature) != SPDK_MBR_SIGNATURE) { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Signature mismatch, provided=%x," + "expected=%x\n", from_le16(&mbr->disk_signature), + SPDK_MBR_SIGNATURE); + return -1; + } + + for (i = 0; i < PRIMARY_PARTITION_NUMBER; i++) { + if (mbr->partitions[i].os_type == SPDK_MBR_OS_TYPE_GPT_PROTECTIVE) { + primary_partition = i; + ret = GPT_PROTECTIVE_MBR; + break; + } + } + + if (ret == GPT_PROTECTIVE_MBR) { + expected_start_lba = GPT_PRIMARY_PARTITION_TABLE_LBA; + if (from_le32(&mbr->partitions[primary_partition].start_lba) != expected_start_lba) { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "start lba mismatch, provided=%u, expected=%u\n", + from_le32(&mbr->partitions[primary_partition].start_lba), + expected_start_lba); + return -1; + } + + total_lba_size = from_le32(&mbr->partitions[primary_partition].size_lba); + if ((total_lba_size != ((uint32_t) gpt->total_sectors - 1)) && + (total_lba_size != 0xFFFFFFFF)) { + SPDK_ERRLOG("GPT Primary MBR size does not equal: (record_size %u != actual_size %u)!\n", + total_lba_size, (uint32_t) gpt->total_sectors - 1); + return -1; + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Currently only support GPT Protective MBR format\n"); + return -1; + } + + return 0; +} + +int +spdk_gpt_parse(struct spdk_gpt *gpt) +{ + int rc; + + if (!gpt || !gpt->buf) { + SPDK_ERRLOG("Gpt and the related buffer should not be NULL\n"); + return -1; + } + + rc = spdk_gpt_check_mbr(gpt); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_GPT_PARSE, "Failed to detect gpt in MBR\n"); + return rc; + } + + rc = spdk_gpt_read_header(gpt); + if (rc) { + SPDK_ERRLOG("Failed to read gpt header\n"); + return rc; + } + + rc = spdk_gpt_read_partitions(gpt); + if (rc) { + SPDK_ERRLOG("Failed to read gpt partitions\n"); + return rc; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("gpt_parse", SPDK_LOG_GPT_PARSE) diff --git a/src/spdk/lib/bdev/gpt/gpt.h b/src/spdk/lib/bdev/gpt/gpt.h new file mode 100644 index 00000000..923bdc1c --- /dev/null +++ b/src/spdk/lib/bdev/gpt/gpt.h @@ -0,0 +1,62 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * GPT internal Interface + */ + +#ifndef SPDK_INTERNAL_GPT_H +#define SPDK_INTERNAL_GPT_H + +#include "spdk/stdinc.h" + +#include "spdk/gpt_spec.h" + +#define SPDK_GPT_PART_TYPE_GUID SPDK_GPT_GUID(0x7c5222bd, 0x8f5d, 0x4087, 0x9c00, 0xbf9843c7b58c) +#define SPDK_GPT_BUFFER_SIZE 32768 /* 32KB */ +#define SPDK_GPT_GUID_EQUAL(x,y) (memcmp(x, y, sizeof(struct spdk_gpt_guid)) == 0) + +struct spdk_gpt { + unsigned char *buf; + uint64_t buf_size; + uint64_t lba_start; + uint64_t lba_end; + uint64_t total_sectors; + uint32_t sector_size; + struct spdk_gpt_header *header; + struct spdk_gpt_partition_entry *partitions; +}; + +int spdk_gpt_parse(struct spdk_gpt *gpt); + +#endif /* SPDK_INTERNAL_GPT_H */ diff --git a/src/spdk/lib/bdev/gpt/vbdev_gpt.c b/src/spdk/lib/bdev/gpt/vbdev_gpt.c new file mode 100644 index 00000000..751af0ea --- /dev/null +++ b/src/spdk/lib/bdev/gpt/vbdev_gpt.c @@ -0,0 +1,463 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This driver reads a GPT partition table from a bdev and exposes a virtual block device for + * each partition. 
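+ *
+ * Only partition entries whose type GUID matches SPDK_GPT_PART_TYPE_GUID are
+ * exposed. Each exposed partition becomes a bdev named
+ * "<base bdev name>p<N>" with a 1-based partition number, so a hypothetical
+ * base bdev "Nvme0n1" would yield "Nvme0n1p1", "Nvme0n1p2", and so on.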
+ */ + +#include "gpt.h" + +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static int vbdev_gpt_init(void); +static void vbdev_gpt_examine(struct spdk_bdev *bdev); +static int vbdev_gpt_get_ctx_size(void); + +static struct spdk_bdev_module gpt_if = { + .name = "gpt", + .module_init = vbdev_gpt_init, + .get_ctx_size = vbdev_gpt_get_ctx_size, + .examine_disk = vbdev_gpt_examine, + +}; +SPDK_BDEV_MODULE_REGISTER(&gpt_if) + +/* Base block device gpt context */ +struct gpt_base { + struct spdk_gpt gpt; + struct spdk_bdev_part_base *part_base; + + /* This channel is only used for reading the partition table. */ + struct spdk_io_channel *ch; +}; + +/* Context for each gpt virtual bdev */ +struct gpt_disk { + struct spdk_bdev_part part; + uint32_t partition_index; +}; + +struct gpt_channel { + struct spdk_bdev_part_channel part_ch; +}; + +struct gpt_io { + struct spdk_io_channel *ch; + struct spdk_bdev_io *bdev_io; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +static SPDK_BDEV_PART_TAILQ g_gpt_disks = TAILQ_HEAD_INITIALIZER(g_gpt_disks); + +static bool g_gpt_disabled; + +static void +spdk_gpt_base_free(void *ctx) +{ + struct gpt_base *gpt_base = ctx; + + spdk_dma_free(gpt_base->gpt.buf); + free(gpt_base); +} + +static void +spdk_gpt_base_bdev_hotremove_cb(void *_base_bdev) +{ + spdk_bdev_part_base_hotremove(_base_bdev, &g_gpt_disks); +} + +static int vbdev_gpt_destruct(void *ctx); +static void vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); +static int vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w); + +static struct spdk_bdev_fn_table vbdev_gpt_fn_table = { + .destruct = vbdev_gpt_destruct, + .submit_request = vbdev_gpt_submit_request, + .dump_info_json = vbdev_gpt_dump_info_json, +}; + +static struct gpt_base * +spdk_gpt_base_bdev_init(struct spdk_bdev *bdev) +{ + struct gpt_base *gpt_base; + struct spdk_gpt *gpt; + + gpt_base = calloc(1, sizeof(*gpt_base)); + if (!gpt_base) { + SPDK_ERRLOG("Cannot alloc memory for gpt_base pointer\n"); + return NULL; + } + + gpt_base->part_base = spdk_bdev_part_base_construct(bdev, + spdk_gpt_base_bdev_hotremove_cb, + &gpt_if, &vbdev_gpt_fn_table, + &g_gpt_disks, spdk_gpt_base_free, gpt_base, + sizeof(struct gpt_channel), NULL, NULL); + if (!gpt_base->part_base) { + free(gpt_base); + SPDK_ERRLOG("cannot construct gpt_base"); + return NULL; + } + + gpt = &gpt_base->gpt; + gpt->buf_size = spdk_max(SPDK_GPT_BUFFER_SIZE, bdev->blocklen); + gpt->buf = spdk_dma_zmalloc(gpt->buf_size, spdk_bdev_get_buf_align(bdev), NULL); + if (!gpt->buf) { + SPDK_ERRLOG("Cannot alloc buf\n"); + spdk_bdev_part_base_free(gpt_base->part_base); + return NULL; + } + + gpt->sector_size = bdev->blocklen; + gpt->total_sectors = bdev->blockcnt; + gpt->lba_start = 0; + gpt->lba_end = gpt->total_sectors - 1; + + return gpt_base; +} + +static int +vbdev_gpt_destruct(void *ctx) +{ + struct gpt_disk *gpt_disk = ctx; + + return spdk_bdev_part_free(&gpt_disk->part); +} + +static void +vbdev_gpt_resubmit_request(void *arg) +{ + struct gpt_io *io = (struct gpt_io *)arg; + + vbdev_gpt_submit_request(io->ch, io->bdev_io); +} + +static void +vbdev_gpt_queue_io(struct gpt_io *io) +{ + int rc; + + io->bdev_io_wait.bdev = io->bdev_io->bdev; + io->bdev_io_wait.cb_fn = vbdev_gpt_resubmit_request; + io->bdev_io_wait.cb_arg = io; + + 
rc = spdk_bdev_queue_io_wait(io->bdev_io->bdev, + io->ch, &io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_gpt_queue_io, rc=%d.\n", rc); + spdk_bdev_io_complete(io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +vbdev_gpt_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct gpt_channel *ch = spdk_io_channel_get_ctx(_ch); + struct gpt_io *io = (struct gpt_io *)bdev_io->driver_ctx; + int rc; + + rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "gpt: no memory, queue io\n"); + io->ch = _ch; + io->bdev_io = bdev_io; + vbdev_gpt_queue_io(io); + } else { + SPDK_ERRLOG("gpt: error on bdev_io submission, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static void +write_guid(struct spdk_json_write_ctx *w, const struct spdk_gpt_guid *guid) +{ + spdk_json_write_string_fmt(w, "%08x-%04x-%04x-%04x-%04x%08x", + from_le32(&guid->raw[0]), + from_le16(&guid->raw[4]), + from_le16(&guid->raw[6]), + from_be16(&guid->raw[8]), + from_be16(&guid->raw[10]), + from_be32(&guid->raw[12])); +} + +static void +write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *str, size_t max_len) +{ + size_t len; + const uint16_t *p; + + for (len = 0, p = str; len < max_len && *p; p++) { + len++; + } + + spdk_json_write_string_utf16le_raw(w, str, len); +} + +static int +vbdev_gpt_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct gpt_disk *gpt_disk = SPDK_CONTAINEROF(ctx, struct gpt_disk, part); + struct spdk_bdev_part_base *base_bdev = spdk_bdev_part_get_base(&gpt_disk->part); + struct gpt_base *gpt_base = spdk_bdev_part_base_get_ctx(base_bdev); + struct spdk_bdev *part_base_bdev = spdk_bdev_part_base_get_bdev(base_bdev); + struct spdk_gpt *gpt = &gpt_base->gpt; + struct spdk_gpt_partition_entry *gpt_entry = &gpt->partitions[gpt_disk->partition_index]; + uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(&gpt_disk->part); + + spdk_json_write_name(w, "gpt"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "base_bdev"); + spdk_json_write_string(w, spdk_bdev_get_name(part_base_bdev)); + + spdk_json_write_name(w, "offset_blocks"); + spdk_json_write_uint64(w, offset_blocks); + + spdk_json_write_name(w, "partition_type_guid"); + write_guid(w, &gpt_entry->part_type_guid); + + spdk_json_write_name(w, "unique_partition_guid"); + write_guid(w, &gpt_entry->unique_partition_guid); + + spdk_json_write_name(w, "partition_name"); + write_string_utf16le(w, gpt_entry->partition_name, SPDK_COUNTOF(gpt_entry->partition_name)); + + spdk_json_write_object_end(w); + + return 0; +} + +static int +vbdev_gpt_create_bdevs(struct gpt_base *gpt_base) +{ + uint32_t num_partition_entries; + uint64_t i, head_lba_start, head_lba_end; + uint32_t num_partitions; + struct spdk_gpt_partition_entry *p; + struct gpt_disk *d; + struct spdk_gpt *gpt; + char *name; + struct spdk_bdev *base_bdev; + int rc; + + gpt = &gpt_base->gpt; + num_partition_entries = from_le32(&gpt->header->num_partition_entries); + head_lba_start = from_le64(&gpt->header->first_usable_lba); + head_lba_end = from_le64(&gpt->header->last_usable_lba); + num_partitions = 0; + + for (i = 0; i < num_partition_entries; i++) { + p = &gpt->partitions[i]; + uint64_t lba_start = from_le64(&p->starting_lba); + uint64_t lba_end = from_le64(&p->ending_lba); + + if (!SPDK_GPT_GUID_EQUAL(&gpt->partitions[i].part_type_guid, + &SPDK_GPT_PART_TYPE_GUID) || + 
lba_start == 0) {
+ continue;
+ }
+ if (lba_start < head_lba_start || lba_end > head_lba_end) {
+ continue;
+ }
+
+ d = calloc(1, sizeof(*d));
+ if (!d) {
+ SPDK_ERRLOG("Memory allocation failure\n");
+ return -1;
+ }
+
+ /* indexes start at 1 instead of 0 to match the existing style */
+ base_bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base);
+ name = spdk_sprintf_alloc("%sp%" PRIu64, spdk_bdev_get_name(base_bdev), i + 1);
+ if (!name) {
+ SPDK_ERRLOG("name allocation failure\n");
+ free(d);
+ return -1;
+ }
+
+ rc = spdk_bdev_part_construct(&d->part, gpt_base->part_base, name,
+ lba_start, lba_end - lba_start, "GPT Disk");
+ free(name);
+ if (rc) {
+ SPDK_ERRLOG("could not construct bdev part\n");
+ /* spdk_bdev_part_construct will free name on failure */
+ free(d);
+ return -1;
+ }
+ num_partitions++;
+ d->partition_index = i;
+ }
+
+ return num_partitions;
+}
+
+static void
+spdk_gpt_bdev_complete(struct spdk_bdev_io *bdev_io, bool status, void *arg)
+{
+ struct gpt_base *gpt_base = (struct gpt_base *)arg;
+ struct spdk_bdev *bdev = spdk_bdev_part_base_get_bdev(gpt_base->part_base);
+ int rc, num_partitions = 0;
+
+ spdk_bdev_free_io(bdev_io);
+ spdk_put_io_channel(gpt_base->ch);
+ gpt_base->ch = NULL;
+
+ if (status != SPDK_BDEV_IO_STATUS_SUCCESS) {
+ SPDK_ERRLOG("Gpt: bdev=%s io error status=%d\n",
+ spdk_bdev_get_name(bdev), status);
+ goto end;
+ }
+
+ rc = spdk_gpt_parse(&gpt_base->gpt);
+ if (rc) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to parse gpt\n");
+ goto end;
+ }
+
+ num_partitions = vbdev_gpt_create_bdevs(gpt_base);
+ if (num_partitions < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_VBDEV_GPT, "Failed to split dev=%s by gpt table\n",
+ spdk_bdev_get_name(bdev));
+ }
+
+end:
+ /*
+ * Notify the generic bdev layer that the actions related to the original examine
+ * callback are now completed.
+ */
+ spdk_bdev_module_examine_done(&gpt_if);
+
+ /*
+ * vbdev_gpt_create_bdevs returns the number of bdevs created upon success.
+ * We can branch on this value.
+ */
+ if (num_partitions <= 0) {
+ /* If no gpt_disk instances were created, free the base context */
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ }
+}
+
+static int
+vbdev_gpt_read_gpt(struct spdk_bdev *bdev)
+{
+ struct gpt_base *gpt_base;
+ struct spdk_bdev_desc *part_base_desc;
+ int rc;
+
+ gpt_base = spdk_gpt_base_bdev_init(bdev);
+ if (!gpt_base) {
+ SPDK_ERRLOG("Cannot allocate gpt_base\n");
+ return -1;
+ }
+
+ part_base_desc = spdk_bdev_part_base_get_desc(gpt_base->part_base);
+ gpt_base->ch = spdk_bdev_get_io_channel(part_base_desc);
+ if (gpt_base->ch == NULL) {
+ SPDK_ERRLOG("Failed to get an io_channel.\n");
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ return -1;
+ }
+
+ rc = spdk_bdev_read(part_base_desc, gpt_base->ch, gpt_base->gpt.buf, 0,
+ gpt_base->gpt.buf_size, spdk_gpt_bdev_complete, gpt_base);
+ if (rc < 0) {
+ spdk_put_io_channel(gpt_base->ch);
+ spdk_bdev_part_base_free(gpt_base->part_base);
+ SPDK_ERRLOG("Failed to send bdev_io command\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vbdev_gpt_init(void)
+{
+ struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Gpt");
+
+ if (sp && spdk_conf_section_get_boolval(sp, "Disable", false)) {
+ /* Disable Gpt probe */
+ g_gpt_disabled = true;
+ }
+
+ return 0;
+}
+
+static int
+vbdev_gpt_get_ctx_size(void)
+{
+ return sizeof(struct gpt_io);
+}
+
+static void
+vbdev_gpt_examine(struct spdk_bdev *bdev)
+{
+ int rc;
+
+ /* A bdev with fewer than 2 blocks cannot have a GPT. Block 0 has
+ * the MBR and block 1 has the GPT header.
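+ *
+ * When the checks below pass, vbdev_gpt_read_gpt() issues a single
+ * spdk_bdev_read() of the first gpt.buf_size bytes (at least
+ * SPDK_GPT_BUFFER_SIZE) starting at offset 0, and the partition table is
+ * parsed in the read completion callback.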
+ */ + if (g_gpt_disabled || spdk_bdev_get_num_blocks(bdev) < 2) { + spdk_bdev_module_examine_done(&gpt_if); + return; + } + + if (spdk_bdev_get_block_size(bdev) % 512 != 0) { + SPDK_ERRLOG("GPT module does not support block size %" PRIu32 " for bdev %s\n", + spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev)); + spdk_bdev_module_examine_done(&gpt_if); + return; + } + + rc = vbdev_gpt_read_gpt(bdev); + if (rc) { + spdk_bdev_module_examine_done(&gpt_if); + SPDK_ERRLOG("Failed to read info from bdev %s\n", spdk_bdev_get_name(bdev)); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_gpt", SPDK_LOG_VBDEV_GPT) diff --git a/src/spdk/lib/bdev/iscsi/Makefile b/src/spdk/lib/bdev/iscsi/Makefile new file mode 100644 index 00000000..4a38886d --- /dev/null +++ b/src/spdk/lib/bdev/iscsi/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +# CentOS 7 libiscsi package has functions declared inline but not +# defined in the header file. Not aware of any way to disable +# this warning so just make sure the warning isn't treated as +# an error. +CFLAGS += -Wno-error +C_SRCS = bdev_iscsi.c bdev_iscsi_rpc.c +LIBNAME = bdev_iscsi + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/iscsi/bdev_iscsi.c b/src/spdk/lib/bdev/iscsi/bdev_iscsi.c new file mode 100644 index 00000000..528337f5 --- /dev/null +++ b/src/spdk/lib/bdev/iscsi/bdev_iscsi.c @@ -0,0 +1,875 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/fd.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/iscsi_spec.h" + +#include "spdk_internal/log.h" +#include "spdk/bdev_module.h" + +#include "iscsi/iscsi.h" +#include "iscsi/scsi-lowlevel.h" + +#include "bdev_iscsi.h" + +struct bdev_iscsi_lun; + +#define BDEV_ISCSI_CONNECTION_POLL_US 500 /* 0.5 ms */ +#define BDEV_ISCSI_NO_MASTER_CH_POLL_US 10000 /* 10ms */ + +#define DEFAULT_INITIATOR_NAME "iqn.2016-06.io.spdk:init" + +static int bdev_iscsi_initialize(void); +static TAILQ_HEAD(, bdev_iscsi_conn_req) g_iscsi_conn_req = TAILQ_HEAD_INITIALIZER( + g_iscsi_conn_req); +static struct spdk_poller *g_conn_poller = NULL; + +struct bdev_iscsi_io { + struct spdk_thread *submit_td; + enum spdk_bdev_io_status status; + int scsi_status; + enum spdk_scsi_sense sk; + uint8_t asc; + uint8_t ascq; +}; + +struct bdev_iscsi_lun { + struct spdk_bdev bdev; + struct iscsi_context *context; + char *initiator_iqn; + char *url; + pthread_mutex_t mutex; + uint32_t ch_count; + struct bdev_iscsi_io_channel *master_ch; + struct spdk_thread *master_td; + struct spdk_poller *no_master_ch_poller; + struct spdk_thread *no_master_ch_poller_td; + bool unmap_supported; +}; + +struct bdev_iscsi_io_channel { + struct spdk_poller *poller; + struct bdev_iscsi_lun *lun; +}; + +struct bdev_iscsi_conn_req { + char *url; + char *bdev_name; + char *initiator_iqn; + struct iscsi_context *context; + spdk_bdev_iscsi_create_cb create_cb; + spdk_bdev_iscsi_create_cb create_cb_arg; + bool unmap_supported; + TAILQ_ENTRY(bdev_iscsi_conn_req) link; +}; + +static void +complete_conn_req(struct bdev_iscsi_conn_req *req, struct spdk_bdev *bdev, + int status) +{ + TAILQ_REMOVE(&g_iscsi_conn_req, req, link); + req->create_cb(req->create_cb_arg, bdev, status); + if (status) { + /* if the request failed and no iscsi lun was + * created then we could not hand over this + * memory and have to free it manually now. 
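+ * On success, ownership of these pointers is assumed to pass to the newly
+ * created lun (they are released later in _iscsi_free_lun()).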
+ */ + iscsi_destroy_context(req->context); + free(req->initiator_iqn); + free(req->bdev_name); + free(req->url); + } + free(req); +} + +static int +bdev_iscsi_get_ctx_size(void) +{ + return sizeof(struct bdev_iscsi_io); +} + +static void +_iscsi_free_lun(void *arg) +{ + struct bdev_iscsi_lun *lun = arg; + + assert(lun != NULL); + iscsi_destroy_context(lun->context); + pthread_mutex_destroy(&lun->mutex); + free(lun->bdev.name); + free(lun->url); + free(lun->initiator_iqn); + + spdk_bdev_destruct_done(&lun->bdev, 0); + free(lun); +} + +static void +bdev_iscsi_finish(void) +{ + struct bdev_iscsi_conn_req *req; + + while (!TAILQ_EMPTY(&g_iscsi_conn_req)) { + req = TAILQ_FIRST(&g_iscsi_conn_req); + complete_conn_req(req, NULL, -EINTR); + } + + if (g_conn_poller) { + spdk_poller_unregister(&g_conn_poller); + } +} + +static struct spdk_bdev_module g_iscsi_bdev_module = { + .name = "iscsi", + .module_init = bdev_iscsi_initialize, + .module_fini = bdev_iscsi_finish, + .get_ctx_size = bdev_iscsi_get_ctx_size, + .async_init = true, +}; + +SPDK_BDEV_MODULE_REGISTER(&g_iscsi_bdev_module); + +static void +_bdev_iscsi_io_complete(void *_iscsi_io) +{ + struct bdev_iscsi_io *iscsi_io = _iscsi_io; + + if (iscsi_io->status == SPDK_BDEV_IO_STATUS_SUCCESS) { + spdk_bdev_io_complete_scsi_status(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->scsi_status, + iscsi_io->sk, iscsi_io->asc, iscsi_io->ascq); + } else { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(iscsi_io), iscsi_io->status); + } +} + +static void +bdev_iscsi_io_complete(struct bdev_iscsi_io *iscsi_io, enum spdk_bdev_io_status status) +{ + iscsi_io->status = status; + if (iscsi_io->submit_td != NULL) { + spdk_thread_send_msg(iscsi_io->submit_td, _bdev_iscsi_io_complete, iscsi_io); + } else { + _bdev_iscsi_io_complete(iscsi_io); + } +} + +/* Common call back function for read/write/flush command */ +static void +bdev_iscsi_command_cb(struct iscsi_context *context, int status, void *_task, void *_iscsi_io) +{ + struct scsi_task *task = _task; + struct bdev_iscsi_io *iscsi_io = _iscsi_io; + + iscsi_io->scsi_status = status; + iscsi_io->sk = (uint8_t)task->sense.key; + iscsi_io->asc = (task->sense.ascq >> 8) & 0xFF; + iscsi_io->ascq = task->sense.ascq & 0xFF; + + scsi_free_scsi_task(task); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS); +} + +static void +bdev_iscsi_readv(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba) +{ + struct scsi_task *task; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "read %d iovs size %lu to lba: %#lx\n", + iovcnt, nbytes, lba); + + task = iscsi_read16_task(lun->context, 0, lba, nbytes, lun->bdev.blocklen, 0, 0, 0, 0, 0, + bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get read16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_in(task, (struct scsi_iovec *)iov, iovcnt); +#else + int i; + for (i = 0; i < iovcnt; i++) { + scsi_task_add_data_in_buffer(task, iov[i].iov_len, iov[i].iov_base); + } +#endif +} + +static void +bdev_iscsi_writev(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, + struct iovec *iov, int iovcnt, uint64_t nbytes, uint64_t lba) +{ + struct scsi_task *task; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI_INIT, "write %d iovs size %lu to lba: %#lx\n", + iovcnt, nbytes, lba); + + task = iscsi_write16_task(lun->context, 0, lba, NULL, nbytes, lun->bdev.blocklen, 0, 0, 0, 0, 0, + bdev_iscsi_command_cb, 
iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get write16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + +#if defined(LIBISCSI_FEATURE_IOVECTOR) + scsi_task_set_iov_out(task, (struct scsi_iovec *)iov, iovcnt); +#else + int i; + for (i = 0; i < iovcnt; i++) { + scsi_task_add_data_in_buffer(task, iov[i].iov_len, iov[i].iov_base); + } +#endif +} + +static void +bdev_iscsi_destruct_cb(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + spdk_poller_unregister(&lun->no_master_ch_poller); + spdk_io_device_unregister(lun, _iscsi_free_lun); +} + +static int +bdev_iscsi_destruct(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + assert(lun->no_master_ch_poller_td); + spdk_thread_send_msg(lun->no_master_ch_poller_td, bdev_iscsi_destruct_cb, lun); + return 1; +} + +static void +bdev_iscsi_flush(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, uint32_t num_blocks, + int immed, uint64_t lba) +{ + struct scsi_task *task; + + task = iscsi_synchronizecache16_task(lun->context, 0, lba, + num_blocks, 0, immed, bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get sync16_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_unmap(struct bdev_iscsi_lun *lun, struct bdev_iscsi_io *iscsi_io, + uint64_t lba, uint64_t num_blocks) +{ + struct scsi_task *task; + struct unmap_list list[1]; + + list[0].lba = lba; + list[0].num = num_blocks; + task = iscsi_unmap_task(lun->context, 0, 0, 0, list, 1, + bdev_iscsi_command_cb, iscsi_io); + if (task == NULL) { + SPDK_ERRLOG("failed to get unmap_task\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_reset_cb(struct iscsi_context *context __attribute__((unused)), int status, + void *command_data, void *private_data) +{ + uint32_t tmf_response; + struct bdev_iscsi_io *iscsi_io = private_data; + + tmf_response = *(uint32_t *)command_data; + if (tmf_response == ISCSI_TASK_FUNC_RESP_COMPLETE) { + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +_bdev_iscsi_reset(void *_bdev_io) +{ + int rc; + struct spdk_bdev_io *bdev_io = _bdev_io; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct iscsi_context *context = lun->context; + + rc = iscsi_task_mgmt_lun_reset_async(context, 0, + bdev_iscsi_reset_cb, iscsi_io); + if (rc != 0) { + SPDK_ERRLOG("failed to do iscsi reset\n"); + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } +} + +static void +bdev_iscsi_reset(struct spdk_bdev_io *bdev_io) +{ + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + spdk_thread_send_msg(lun->master_td, _bdev_iscsi_reset, bdev_io); +} + +static int +bdev_iscsi_poll_lun(struct bdev_iscsi_lun *lun) +{ + struct pollfd pfd = {}; + + pfd.fd = iscsi_get_fd(lun->context); + pfd.events = iscsi_which_events(lun->context); + + if (poll(&pfd, 1, 0) < 0) { + SPDK_ERRLOG("poll failed\n"); + return -1; + } + + if (pfd.revents != 0) { + if (iscsi_service(lun->context, pfd.revents) < 0) { + SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(lun->context)); + } + } + + return -1; +} + +static int +bdev_iscsi_no_master_ch_poll(void *arg) +{ + struct bdev_iscsi_lun *lun = arg; + int rc = 0; + + if 
(pthread_mutex_trylock(&lun->mutex)) { + /* Don't care about the error code here. */ + return -1; + } + + if (lun->ch_count == 0) { + rc = bdev_iscsi_poll_lun(arg); + } + + pthread_mutex_unlock(&lun->mutex); + return rc; +} + +static int +bdev_iscsi_poll(void *arg) +{ + struct bdev_iscsi_io_channel *ch = arg; + + return bdev_iscsi_poll_lun(ch->lun); +} + +static void bdev_iscsi_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + bdev_iscsi_readv((struct bdev_iscsi_lun *)bdev_io->bdev->ctxt, + (struct bdev_iscsi_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks); +} + +static void _bdev_iscsi_submit_request(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_iscsi_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_iscsi_writev(lun, iscsi_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + bdev_iscsi_flush(lun, iscsi_io, + bdev_io->u.bdev.num_blocks, + ISCSI_IMMEDIATE_DATA_NO, + bdev_io->u.bdev.offset_blocks); + break; + case SPDK_BDEV_IO_TYPE_RESET: + bdev_iscsi_reset(bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + bdev_iscsi_unmap(lun, iscsi_io, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + break; + default: + bdev_iscsi_io_complete(iscsi_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +static void bdev_iscsi_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_thread *submit_td = spdk_io_channel_get_thread(_ch); + struct bdev_iscsi_io *iscsi_io = (struct bdev_iscsi_io *)bdev_io->driver_ctx; + struct bdev_iscsi_lun *lun = (struct bdev_iscsi_lun *)bdev_io->bdev->ctxt; + + if (lun->master_td != submit_td) { + iscsi_io->submit_td = submit_td; + spdk_thread_send_msg(lun->master_td, _bdev_iscsi_submit_request, bdev_io); + return; + } else { + iscsi_io->submit_td = NULL; + } + + _bdev_iscsi_submit_request(bdev_io); +} + +static bool +bdev_iscsi_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct bdev_iscsi_lun *lun = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + case SPDK_BDEV_IO_TYPE_UNMAP: + return lun->unmap_supported; + default: + return false; + } +} + +static int +bdev_iscsi_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_iscsi_io_channel *ch = ctx_buf; + struct bdev_iscsi_lun *lun = io_device; + + pthread_mutex_lock(&lun->mutex); + if (lun->ch_count == 0) { + assert(lun->master_ch == NULL); + assert(lun->master_td == NULL); + lun->master_ch = ch; + lun->master_td = spdk_get_thread(); + ch->poller = spdk_poller_register(bdev_iscsi_poll, ch, 0); + ch->lun = lun; + } + lun->ch_count++; + pthread_mutex_unlock(&lun->mutex); + + return 0; +} + +static void +bdev_iscsi_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_iscsi_io_channel *io_channel = ctx_buf; + struct bdev_iscsi_lun *lun = io_device; + + pthread_mutex_lock(&lun->mutex); + lun->ch_count--; + if (lun->ch_count 
== 0) { + assert(lun->master_ch != NULL); + assert(lun->master_td != NULL); + assert(lun->master_td == spdk_get_thread()); + + lun->master_ch = NULL; + lun->master_td = NULL; + spdk_poller_unregister(&io_channel->poller); + } + pthread_mutex_unlock(&lun->mutex); +} + +static struct spdk_io_channel * +bdev_iscsi_get_io_channel(void *ctx) +{ + struct bdev_iscsi_lun *lun = ctx; + + return spdk_get_io_channel(lun); +} + +static int +bdev_iscsi_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_iscsi_lun *lun = ctx; + + spdk_json_write_name(w, "iscsi"); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "initiator_name"); + spdk_json_write_string(w, lun->initiator_iqn); + spdk_json_write_name(w, "url"); + spdk_json_write_string(w, lun->url); + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_iscsi_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct bdev_iscsi_lun *lun = bdev->ctxt; + + pthread_mutex_lock(&lun->mutex); + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_iscsi_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "initiator_iqn", lun->initiator_iqn); + spdk_json_write_named_string(w, "url", lun->url); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + pthread_mutex_unlock(&lun->mutex); +} + +static const struct spdk_bdev_fn_table iscsi_fn_table = { + .destruct = bdev_iscsi_destruct, + .submit_request = bdev_iscsi_submit_request, + .io_type_supported = bdev_iscsi_io_type_supported, + .get_io_channel = bdev_iscsi_get_io_channel, + .dump_info_json = bdev_iscsi_dump_info_json, + .write_config_json = bdev_iscsi_write_config_json, +}; + +static int +create_iscsi_lun(struct iscsi_context *context, char *url, char *initiator_iqn, char *name, + uint64_t num_blocks, uint32_t block_size, struct spdk_bdev **bdev, bool unmap_supported) +{ + struct bdev_iscsi_lun *lun; + int rc; + + lun = calloc(sizeof(*lun), 1); + if (!lun) { + SPDK_ERRLOG("Unable to allocate enough memory for iscsi backend\n"); + return -ENOMEM; + } + + lun->context = context; + lun->url = url; + lun->initiator_iqn = initiator_iqn; + + pthread_mutex_init(&lun->mutex, NULL); + + lun->bdev.name = name; + lun->bdev.product_name = "iSCSI LUN"; + lun->bdev.module = &g_iscsi_bdev_module; + lun->bdev.blocklen = block_size; + lun->bdev.blockcnt = num_blocks; + lun->bdev.ctxt = lun; + lun->unmap_supported = unmap_supported; + + lun->bdev.fn_table = &iscsi_fn_table; + + spdk_io_device_register(lun, bdev_iscsi_create_cb, bdev_iscsi_destroy_cb, + sizeof(struct bdev_iscsi_io_channel), + name); + rc = spdk_bdev_register(&lun->bdev); + if (rc) { + spdk_io_device_unregister(lun, NULL); + pthread_mutex_destroy(&lun->mutex); + free(lun); + return rc; + } + + lun->no_master_ch_poller_td = spdk_get_thread(); + lun->no_master_ch_poller = spdk_poller_register(bdev_iscsi_no_master_ch_poll, lun, + BDEV_ISCSI_NO_MASTER_CH_POLL_US); + + *bdev = &lun->bdev; + return 0; +} + +static void +iscsi_readcapacity16_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct bdev_iscsi_conn_req *req = private_data; + struct scsi_readcapacity16 *readcap16; + struct spdk_bdev *bdev = NULL; + struct scsi_task *task = command_data; + + if (status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(iscsi)); + goto ret; + } + + readcap16 = 
scsi_datain_unmarshall(task); + if (!readcap16) { + status = -ENOMEM; + goto ret; + } + + status = create_iscsi_lun(req->context, req->url, req->initiator_iqn, req->bdev_name, + readcap16->returned_lba + 1, readcap16->block_length, &bdev, req->unmap_supported); + if (status) { + SPDK_ERRLOG("Unable to create iscsi bdev: %s (%d)\n", spdk_strerror(-status), status); + } + +ret: + scsi_free_scsi_task(task); + complete_conn_req(req, bdev, status); +} + +static void +bdev_iscsi_inquiry_cb(struct iscsi_context *context, int status, void *_task, void *private_data) +{ + struct scsi_task *task = _task; + struct scsi_inquiry_logical_block_provisioning *lbp_inq = NULL; + struct bdev_iscsi_conn_req *req = private_data; + + if (status == SPDK_SCSI_STATUS_GOOD) { + lbp_inq = scsi_datain_unmarshall(task); + if (lbp_inq != NULL && lbp_inq->lbpu) { + req->unmap_supported = true; + } + } + + task = iscsi_readcapacity16_task(context, 0, iscsi_readcapacity16_cb, req); + if (task) { + return; + } + + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context)); + complete_conn_req(req, NULL, status); +} + +static void +iscsi_connect_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *private_data) +{ + struct bdev_iscsi_conn_req *req = private_data; + struct scsi_task *task; + + if (status != SPDK_SCSI_STATUS_GOOD) { + goto ret; + } + + task = iscsi_inquiry_task(iscsi, 0, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + 255, bdev_iscsi_inquiry_cb, req); + if (task) { + return; + } + +ret: + SPDK_ERRLOG("iSCSI error: %s\n", iscsi_get_error(req->context)); + complete_conn_req(req, NULL, status); +} + +static int +iscsi_bdev_conn_poll(void *arg) +{ + struct bdev_iscsi_conn_req *req, *tmp; + struct pollfd pfd; + struct iscsi_context *context; + + TAILQ_FOREACH_SAFE(req, &g_iscsi_conn_req, link, tmp) { + context = req->context; + pfd.fd = iscsi_get_fd(context); + pfd.events = iscsi_which_events(context); + pfd.revents = 0; + if (poll(&pfd, 1, 0) < 0) { + SPDK_ERRLOG("poll failed\n"); + return -1; + } + + if (pfd.revents != 0) { + if (iscsi_service(context, pfd.revents) < 0) { + SPDK_ERRLOG("iscsi_service failed: %s\n", iscsi_get_error(context)); + } + } + } + + return -1; +} + +int +create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn, + spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg) +{ + struct bdev_iscsi_conn_req *req; + struct iscsi_url *iscsi_url = NULL; + int rc; + + if (!bdev_name || !url || !initiator_iqn || strlen(initiator_iqn) == 0 || !cb_fn) { + return -EINVAL; + } + + req = calloc(1, sizeof(struct bdev_iscsi_conn_req)); + if (!req) { + SPDK_ERRLOG("Cannot allocate pointer of struct bdev_iscsi_conn_req\n"); + return -ENOMEM; + } + + req->bdev_name = strdup(bdev_name); + req->url = strdup(url); + req->initiator_iqn = strdup(initiator_iqn); + req->context = iscsi_create_context(initiator_iqn); + if (!req->bdev_name || !req->url || !req->initiator_iqn || !req->context) { + SPDK_ERRLOG("Out of memory\n"); + rc = -ENOMEM; + goto err; + } + + req->create_cb = cb_fn; + req->create_cb_arg = cb_arg; + + iscsi_url = iscsi_parse_full_url(req->context, url); + if (iscsi_url == NULL) { + SPDK_ERRLOG("could not parse URL: %s\n", iscsi_get_error(req->context)); + rc = -EINVAL; + goto err; + } + + rc = iscsi_set_session_type(req->context, ISCSI_SESSION_NORMAL); + rc = rc ? rc : iscsi_set_header_digest(req->context, ISCSI_HEADER_DIGEST_NONE); + rc = rc ? rc : iscsi_set_targetname(req->context, iscsi_url->target); + rc = rc ? 
rc : iscsi_full_connect_async(req->context, iscsi_url->portal, iscsi_url->lun, + iscsi_connect_cb, req); + if (rc == 0 && iscsi_url->user[0] != '\0') { + rc = iscsi_set_initiator_username_pwd(req->context, iscsi_url->user, iscsi_url->passwd); + } + + if (rc < 0) { + SPDK_ERRLOG("Failed to connect provided URL=%s: %s\n", url, iscsi_get_error(req->context)); + goto err; + } + + iscsi_destroy_url(iscsi_url); + TAILQ_INSERT_TAIL(&g_iscsi_conn_req, req, link); + if (!g_conn_poller) { + g_conn_poller = spdk_poller_register(iscsi_bdev_conn_poll, NULL, BDEV_ISCSI_CONNECTION_POLL_US); + } + + return 0; + +err: + /* iscsi_destroy_url() is not NULL-proof */ + if (iscsi_url) { + iscsi_destroy_url(iscsi_url); + } + + if (req->context) { + iscsi_destroy_context(req->context); + } + + free(req->initiator_iqn); + free(req->bdev_name); + free(req->url); + free(req); + return rc; +} + +void +delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &g_iscsi_bdev_module) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +bdev_iscsi_initialize_cb(void *cb_arg, struct spdk_bdev *bdev, int status) +{ + if (TAILQ_EMPTY(&g_iscsi_conn_req)) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + } +} + +static int +bdev_iscsi_initialize(void) +{ + struct spdk_conf_section *sp; + + const char *url, *bdev_name, *initiator_iqn; + int i, rc; + + sp = spdk_conf_find_section(NULL, "iSCSI_Initiator"); + if (sp == NULL) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + return 0; + } + + initiator_iqn = spdk_conf_section_get_val(sp, "initiator_name"); + if (!initiator_iqn) { + initiator_iqn = DEFAULT_INITIATOR_NAME; + } + + rc = 0; + for (i = 0; (url = spdk_conf_section_get_nmval(sp, "URL", i, 0)) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "URL", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("no bdev name specified for URL %s\n", url); + rc = -EINVAL; + break; + } + + rc = create_iscsi_disk(bdev_name, url, initiator_iqn, bdev_iscsi_initialize_cb, NULL); + if (rc) { + break; + } + } + + if (i == 0) { + spdk_bdev_module_init_done(&g_iscsi_bdev_module); + } + + return rc; +} + +SPDK_LOG_REGISTER_COMPONENT("iscsi_init", SPDK_LOG_ISCSI_INIT) diff --git a/src/spdk/lib/bdev/iscsi/bdev_iscsi.h b/src/spdk/lib/bdev/iscsi/bdev_iscsi.h new file mode 100644 index 00000000..b1d22fa8 --- /dev/null +++ b/src/spdk/lib/bdev/iscsi/bdev_iscsi.h @@ -0,0 +1,75 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_BDEV_ISCSI_H
+#define SPDK_BDEV_ISCSI_H
+
+#include "spdk/bdev.h"
+
+typedef void (*spdk_delete_iscsi_complete)(void *cb_arg, int bdeverrno);
+
+/**
+ * SPDK bdev iSCSI callback type.
+ *
+ * \param cb_arg Completion callback custom arguments
+ * \param bdev created bdev
+ * \param status operation status. Zero on success.
+ */
+typedef void (*spdk_bdev_iscsi_create_cb)(void *cb_arg, struct spdk_bdev *bdev, int status);
+
+/**
+ * Create new iSCSI bdev.
+ *
+ * \warning An iSCSI URL may include a login and password. Be careful because
+ * they will show up in the configuration dump.
+ *
+ * \param bdev_name name for the new bdev.
+ * \param url iSCSI URL string.
+ * \param initiator_iqn IQN the initiator identifies itself with to the target
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ * \return 0 on success or negative error code. On success, a bdev with the provided name is created.
+ */
+int create_iscsi_disk(const char *bdev_name, const char *url, const char *initiator_iqn,
+		      spdk_bdev_iscsi_create_cb cb_fn, void *cb_arg);
+
+/**
+ * Delete iSCSI bdev.
+ *
+ * \param bdev Pointer to iSCSI bdev.
+ * \param cb_fn Completion callback
+ * \param cb_arg Completion callback custom arguments
+ */
+void delete_iscsi_disk(struct spdk_bdev *bdev, spdk_delete_iscsi_complete cb_fn, void *cb_arg);
+
+#endif // SPDK_BDEV_ISCSI_H
diff --git a/src/spdk/lib/bdev/iscsi/bdev_iscsi_rpc.c b/src/spdk/lib/bdev/iscsi/bdev_iscsi_rpc.c
new file mode 100644
index 00000000..3682b612
--- /dev/null
+++ b/src/spdk/lib/bdev/iscsi/bdev_iscsi_rpc.c
@@ -0,0 +1,173 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_iscsi.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +struct rpc_construct_iscsi_bdev { + char *name; + char *initiator_iqn; + char *url; +}; + +static const struct spdk_json_object_decoder rpc_construct_iscsi_bdev_decoders[] = { + {"name", offsetof(struct rpc_construct_iscsi_bdev, name), spdk_json_decode_string}, + {"initiator_iqn", offsetof(struct rpc_construct_iscsi_bdev, initiator_iqn), spdk_json_decode_string}, + {"url", offsetof(struct rpc_construct_iscsi_bdev, url), spdk_json_decode_string}, +}; + +static void +free_rpc_construct_iscsi_bdev(struct rpc_construct_iscsi_bdev *req) +{ + free(req->name); + free(req->initiator_iqn); + free(req->url); +} + +static void +construct_iscsi_bdev_cb(void *cb_arg, struct spdk_bdev *bdev, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + if (status > 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "iSCSI error (%d).", status); + } else if (status < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-status)); + } else { + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + } +} + +static void +spdk_rpc_construct_iscsi_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_iscsi_bdev req = {}; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_construct_iscsi_bdev_decoders, + SPDK_COUNTOF(rpc_construct_iscsi_bdev_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = create_iscsi_disk(req.name, req.url, req.initiator_iqn, construct_iscsi_bdev_cb, request); + if (rc) { + goto invalid; + } + + free_rpc_construct_iscsi_bdev(&req); + return; + +invalid: + free_rpc_construct_iscsi_bdev(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_iscsi_bdev", spdk_rpc_construct_iscsi_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_iscsi { + char *name; +}; + +static void +free_rpc_delete_iscsi(struct rpc_delete_iscsi *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_iscsi_decoders[] = { + {"name", offsetof(struct rpc_delete_iscsi, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_iscsi_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_iscsi_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_iscsi req = {NULL}; + 
struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_iscsi_decoders, + SPDK_COUNTOF(rpc_delete_iscsi_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + delete_iscsi_disk(bdev, _spdk_rpc_delete_iscsi_bdev_cb, request); + + free_rpc_delete_iscsi(&req); + + return; + +invalid: + free_rpc_delete_iscsi(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_iscsi_bdev", spdk_rpc_delete_iscsi_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/lvol/Makefile b/src/spdk/lib/bdev/lvol/Makefile new file mode 100644 index 00000000..569b14cf --- /dev/null +++ b/src/spdk/lib/bdev/lvol/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = vbdev_lvol.c vbdev_lvol_rpc.c +LIBNAME = vbdev_lvol +LOCAL_SYS_LIBS = -luuid + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/lvol/vbdev_lvol.c b/src/spdk/lib/bdev/lvol/vbdev_lvol.c new file mode 100644 index 00000000..74df81e4 --- /dev/null +++ b/src/spdk/lib/bdev/lvol/vbdev_lvol.c @@ -0,0 +1,1321 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/blob_bdev.h" +#include "spdk/rpc.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/uuid.h" + +#include "vbdev_lvol.h" + +static TAILQ_HEAD(, lvol_store_bdev) g_spdk_lvol_pairs = TAILQ_HEAD_INITIALIZER( + g_spdk_lvol_pairs); + +static int vbdev_lvs_init(void); +static int vbdev_lvs_get_ctx_size(void); +static void vbdev_lvs_examine(struct spdk_bdev *bdev); + +static struct spdk_bdev_module g_lvol_if = { + .name = "lvol", + .module_init = vbdev_lvs_init, + .examine_disk = vbdev_lvs_examine, + .get_ctx_size = vbdev_lvs_get_ctx_size, + +}; + +SPDK_BDEV_MODULE_REGISTER(&g_lvol_if) + +struct lvol_store_bdev * +vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs_orig) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (lvs == lvs_orig) { + if (lvs_bdev->req != NULL) { + /* We do not allow access to lvs that are being destroyed */ + return NULL; + } else { + return lvs_bdev; + } + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + + return NULL; +} + +static int +_vbdev_lvol_change_bdev_alias(struct spdk_lvol *lvol, const char *new_lvol_name) +{ + struct spdk_bdev_alias *tmp; + char *old_alias; + char *alias; + int rc; + int alias_number = 0; + + /* bdev representing lvols have only one alias, + * while we changed lvs name earlier, we have to iterate alias list to get one, + * and check if there is only one alias */ + + TAILQ_FOREACH(tmp, &lvol->bdev->aliases, tailq) { + if (++alias_number > 1) { + SPDK_ERRLOG("There is more than 1 alias in bdev %s\n", lvol->bdev->name); + return -EINVAL; + } + + old_alias = tmp->alias; + } + + if (alias_number == 0) { + SPDK_ERRLOG("There are no aliases in bdev %s\n", lvol->bdev->name); + return -EINVAL; + } + + alias = spdk_sprintf_alloc("%s/%s", lvol->lvol_store->name, new_lvol_name); + if (alias == NULL) { + SPDK_ERRLOG("Cannot alloc memory for alias\n"); + return -ENOMEM; + } + + rc = spdk_bdev_alias_add(lvol->bdev, alias); + if (rc != 0) { + SPDK_ERRLOG("cannot add alias '%s'\n", alias); + free(alias); + return rc; + } + free(alias); + + rc = spdk_bdev_alias_del(lvol->bdev, old_alias); + if (rc != 0) { + SPDK_ERRLOG("cannot remove alias '%s'\n", old_alias); + return rc; + } + + return 0; +} + +static struct lvol_store_bdev * +vbdev_get_lvs_bdev_by_bdev(struct spdk_bdev *bdev_orig) +{ + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + if 
(lvs_bdev->bdev == bdev_orig) { + if (lvs_bdev->req != NULL) { + /* We do not allow access to lvs that are being destroyed */ + return NULL; + } else { + return lvs_bdev; + } + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + + return NULL; +} + +static void +vbdev_lvs_hotremove_cb(void *ctx) +{ + struct spdk_bdev *bdev = ctx; + struct lvol_store_bdev *lvs_bdev; + + lvs_bdev = vbdev_get_lvs_bdev_by_bdev(bdev); + if (lvs_bdev != NULL) { + vbdev_lvs_unload(lvs_bdev->lvs, NULL, NULL); + } +} + +static void +_vbdev_lvs_create_cb(void *cb_arg, struct spdk_lvol_store *lvs, int lvserrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct lvol_store_bdev *lvs_bdev; + struct spdk_bdev *bdev = req->base_bdev; + struct spdk_bs_dev *bs_dev = req->bs_dev; + + if (lvserrno != 0) { + assert(lvs == NULL); + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Cannot create lvol store bdev\n"); + goto end; + } + + lvserrno = spdk_bs_bdev_claim(bs_dev, &g_lvol_if); + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n"); + req->bs_dev->destroy(req->bs_dev); + goto end; + } + + assert(lvs != NULL); + + lvs_bdev = calloc(1, sizeof(*lvs_bdev)); + if (!lvs_bdev) { + lvserrno = -ENOMEM; + goto end; + } + lvs_bdev->lvs = lvs; + lvs_bdev->bdev = bdev; + lvs_bdev->req = NULL; + + TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store bdev inserted\n"); + +end: + req->cb_fn(req->cb_arg, lvs, lvserrno); + free(req); + + return; +} + +int +vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz, + spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_dev *bs_dev; + struct spdk_lvs_with_handle_req *lvs_req; + struct spdk_lvs_opts opts; + int rc; + int len; + + if (base_bdev == NULL) { + SPDK_ERRLOG("Bdev does not exist\n"); + return -ENODEV; + } + + spdk_lvs_opts_init(&opts); + if (cluster_sz != 0) { + opts.cluster_sz = cluster_sz; + } + + if (name == NULL) { + SPDK_ERRLOG("missing name param\n"); + return -EINVAL; + } + + len = strnlen(name, SPDK_LVS_NAME_MAX); + + if (len == 0 || len == SPDK_LVS_NAME_MAX) { + SPDK_ERRLOG("name must be between 1 and %d characters\n", SPDK_LVS_NAME_MAX - 1); + return -EINVAL; + } + snprintf(opts.name, sizeof(opts.name), "%s", name); + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + return -ENOMEM; + } + + bs_dev = spdk_bdev_create_bs_dev(base_bdev, vbdev_lvs_hotremove_cb, base_bdev); + if (!bs_dev) { + SPDK_ERRLOG("Cannot create blobstore device\n"); + free(lvs_req); + return -ENODEV; + } + + lvs_req->bs_dev = bs_dev; + lvs_req->base_bdev = base_bdev; + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + + rc = spdk_lvs_init(bs_dev, &opts, _vbdev_lvs_create_cb, lvs_req); + if (rc < 0) { + free(lvs_req); + bs_dev->destroy(bs_dev); + return rc; + } + + return 0; +} + +static void +_vbdev_lvs_rename_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_req *req = cb_arg; + struct spdk_lvol *tmp; + + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store rename failed\n"); + } else { + TAILQ_FOREACH(tmp, &req->lvol_store->lvols, link) { + /* We have to pass current lvol name, since only lvs name changed */ + _vbdev_lvol_change_bdev_alias(tmp, tmp->name); + } + } + + req->cb_fn(req->cb_arg, lvserrno); + free(req); +} + +void +vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name, + spdk_lvs_op_complete cb_fn, void 
*cb_arg) +{ + struct lvol_store_bdev *lvs_bdev; + + struct spdk_lvs_req *req; + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol_store = lvs; + + spdk_lvs_rename(lvs, new_lvs_name, _vbdev_lvs_rename_cb, req); +} + +static void +_vbdev_lvs_remove_cb(void *cb_arg, int lvserrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvs_req *req = lvs_bdev->req; + + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Could not remove lvol store bdev\n"); + } else { + TAILQ_REMOVE(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + free(lvs_bdev); + } + + if (req->cb_fn != NULL) { + req->cb_fn(req->cb_arg, lvserrno); + } + free(req); +} + +static void +_vbdev_lvs_remove_lvol_cb(void *cb_arg, int lvolerrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvol_store *lvs = lvs_bdev->lvs; + struct spdk_lvol *lvol; + + if (lvolerrno != 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol removed with errno %d\n", lvolerrno); + } + + if (TAILQ_EMPTY(&lvs->lvols)) { + spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + return; + } + + lvol = TAILQ_FIRST(&lvs->lvols); + while (lvol != NULL) { + if (spdk_lvol_deletable(lvol)) { + vbdev_lvol_destroy(lvol, _vbdev_lvs_remove_lvol_cb, lvs_bdev); + return; + } + lvol = TAILQ_NEXT(lvol, link); + } + + /* If no lvol is deletable, that means there is circular dependency. */ + SPDK_ERRLOG("Lvols left in lvs, but unable to delete.\n"); + assert(false); +} + +static void +_vbdev_lvs_remove_bdev_unregistered_cb(void *cb_arg, int bdeverrno) +{ + struct lvol_store_bdev *lvs_bdev = cb_arg; + struct spdk_lvol_store *lvs = lvs_bdev->lvs; + struct spdk_lvol *lvol, *tmp; + + if (bdeverrno != 0) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_LVOL, "Lvol unregistered with errno %d\n", bdeverrno); + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->ref_count != 0) { + /* An lvol is still open, don't unload whole lvol store. 
*/ + return; + } + } + spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev); +} + +static void +_vbdev_lvs_remove(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg, + bool destroy) +{ + struct spdk_lvs_req *req; + struct lvol_store_bdev *lvs_bdev; + struct spdk_lvol *lvol, *tmp; + bool all_lvols_closed = true; + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENODEV); + } + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + if (cb_fn != NULL) { + cb_fn(cb_arg, -ENOMEM); + } + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + lvs_bdev->req = req; + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->ref_count != 0) { + all_lvols_closed = false; + } + } + + if (all_lvols_closed == true) { + if (destroy) { + spdk_lvs_destroy(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + } else { + spdk_lvs_unload(lvs, _vbdev_lvs_remove_cb, lvs_bdev); + } + } else { + lvs->destruct = destroy; + if (destroy) { + _vbdev_lvs_remove_lvol_cb(lvs_bdev, 0); + } else { + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + spdk_bdev_unregister(lvol->bdev, _vbdev_lvs_remove_bdev_unregistered_cb, lvs_bdev); + } + } + } +} + +void +vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + _vbdev_lvs_remove(lvs, cb_fn, cb_arg, false); +} + +void +vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + _vbdev_lvs_remove(lvs, cb_fn, cb_arg, true); +} + +struct lvol_store_bdev * +vbdev_lvol_store_first(void) +{ + struct lvol_store_bdev *lvs_bdev; + + lvs_bdev = TAILQ_FIRST(&g_spdk_lvol_pairs); + if (lvs_bdev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Starting lvolstore iteration at %p\n", lvs_bdev->lvs); + } + + return lvs_bdev; +} + +struct lvol_store_bdev * +vbdev_lvol_store_next(struct lvol_store_bdev *prev) +{ + struct lvol_store_bdev *lvs_bdev; + + if (prev == NULL) { + SPDK_ERRLOG("prev argument cannot be NULL\n"); + return NULL; + } + + lvs_bdev = TAILQ_NEXT(prev, lvol_stores); + if (lvs_bdev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Continuing lvolstore iteration at %p\n", lvs_bdev->lvs); + } + + return lvs_bdev; +} + +static struct spdk_lvol_store * +_vbdev_get_lvol_store_by_uuid(const struct spdk_uuid *uuid) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (spdk_uuid_compare(&lvs->uuid, uuid) == 0) { + return lvs; + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + return NULL; +} + +struct spdk_lvol_store * +vbdev_get_lvol_store_by_uuid(const char *uuid_str) +{ + struct spdk_uuid uuid; + + if (spdk_uuid_parse(&uuid, uuid_str)) { + return NULL; + } + + return _vbdev_get_lvol_store_by_uuid(&uuid); +} + +struct spdk_lvol_store * +vbdev_get_lvol_store_by_name(const char *name) +{ + struct spdk_lvol_store *lvs = NULL; + struct lvol_store_bdev *lvs_bdev = vbdev_lvol_store_first(); + + while (lvs_bdev != NULL) { + lvs = lvs_bdev->lvs; + if (strncmp(lvs->name, name, sizeof(lvs->name)) == 0) { + return lvs; + } + lvs_bdev = vbdev_lvol_store_next(lvs_bdev); + } + return NULL; +} + +struct vbdev_lvol_destroy_ctx { + struct spdk_lvol *lvol; + spdk_lvol_op_complete cb_fn; + void *cb_arg; +}; + +static void +_vbdev_lvol_unregister_cb(void *ctx, int lvolerrno) +{ + struct spdk_bdev *bdev = ctx; + + 
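/* The lvol is now closed: complete the deferred destruct started in
+	 * vbdev_lvol_unregister() and release the spdk_bdev that was
+	 * allocated by _create_lvol_disk(). */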
spdk_bdev_destruct_done(bdev, lvolerrno); + free(bdev); +} + +static int +vbdev_lvol_unregister(void *ctx) +{ + struct spdk_lvol *lvol = ctx; + + assert(lvol != NULL); + + spdk_bdev_alias_del_all(lvol->bdev); + spdk_lvol_close(lvol, _vbdev_lvol_unregister_cb, lvol->bdev); + + /* return 1 to indicate we have an operation that must finish asynchronously before the + * lvol is closed + */ + return 1; +} + +static void +_vbdev_lvol_destroy_cb(void *cb_arg, int bdeverrno) +{ + struct vbdev_lvol_destroy_ctx *ctx = cb_arg; + struct spdk_lvol *lvol = ctx->lvol; + + if (bdeverrno < 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Could not unregister bdev during lvol (%s) destroy\n", + lvol->unique_id); + ctx->cb_fn(ctx->cb_arg, bdeverrno); + free(ctx); + return; + } + + spdk_lvol_destroy(lvol, ctx->cb_fn, ctx->cb_arg); + free(ctx); +} + +void +vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct vbdev_lvol_destroy_ctx *ctx; + + assert(lvol != NULL); + assert(cb_fn != NULL); + + /* Check if it is possible to delete lvol */ + if (spdk_lvol_deletable(lvol) == false) { + /* throw an error */ + SPDK_ERRLOG("Cannot delete lvol\n"); + cb_fn(cb_arg, -EPERM); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->lvol = lvol; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_bdev_unregister(lvol->bdev, _vbdev_lvol_destroy_cb, ctx); +} + +static char * +vbdev_lvol_find_name(struct spdk_lvol *lvol, spdk_blob_id blob_id) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvol *_lvol; + + assert(lvol != NULL); + + lvs = lvol->lvol_store; + + assert(lvs); + + TAILQ_FOREACH(_lvol, &lvs->lvols, link) { + if (_lvol->blob_id == blob_id) { + return _lvol->name; + } + } + + return NULL; +} + +static int +vbdev_lvol_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct spdk_lvol *lvol = ctx; + struct lvol_store_bdev *lvs_bdev; + struct spdk_bdev *bdev; + struct spdk_blob *blob; + char lvol_store_uuid[SPDK_UUID_STRING_LEN]; + spdk_blob_id *ids = NULL; + size_t count, i; + char *name; + int rc = 0; + + spdk_json_write_name(w, "lvol"); + spdk_json_write_object_begin(w); + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store); + if (!lvs_bdev) { + SPDK_ERRLOG("No such lvol store found\n"); + rc = -ENODEV; + goto end; + } + + bdev = lvs_bdev->bdev; + + spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol->lvol_store->uuid); + spdk_json_write_name(w, "lvol_store_uuid"); + spdk_json_write_string(w, lvol_store_uuid); + + spdk_json_write_name(w, "base_bdev"); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + + blob = lvol->blob; + + spdk_json_write_name(w, "thin_provision"); + spdk_json_write_bool(w, spdk_blob_is_thin_provisioned(blob)); + + spdk_json_write_name(w, "snapshot"); + spdk_json_write_bool(w, spdk_blob_is_snapshot(blob)); + + spdk_json_write_name(w, "clone"); + spdk_json_write_bool(w, spdk_blob_is_clone(blob)); + + if (spdk_blob_is_clone(blob)) { + spdk_blob_id snapshotid = spdk_blob_get_parent_snapshot(lvol->lvol_store->blobstore, lvol->blob_id); + if (snapshotid != SPDK_BLOBID_INVALID) { + name = vbdev_lvol_find_name(lvol, snapshotid); + if (name != NULL) { + spdk_json_write_name(w, "base_snapshot"); + spdk_json_write_string(w, name); + } else { + SPDK_ERRLOG("Cannot obtain snapshots name\n"); + } + } + } + + if (spdk_blob_is_snapshot(blob)) { + /* Take a number of clones */ + rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + if (rc == -ENOMEM 
&& count > 0) { + ids = malloc(sizeof(spdk_blob_id) * count); + if (ids == NULL) { + SPDK_ERRLOG("Cannot allocate memory\n"); + rc = -ENOMEM; + goto end; + } + + rc = spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, ids, &count); + if (rc == 0) { + spdk_json_write_name(w, "clones"); + spdk_json_write_array_begin(w); + for (i = 0; i < count; i++) { + name = vbdev_lvol_find_name(lvol, ids[i]); + if (name != NULL) { + spdk_json_write_string(w, name); + } else { + SPDK_ERRLOG("Cannot obtain clone name\n"); + } + + } + spdk_json_write_array_end(w); + } + free(ids); + } + + } + +end: + spdk_json_write_object_end(w); + + return rc; +} + +static void +vbdev_lvol_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* Nothing to dump as lvol configuration is saved on physical device. */ +} + +static struct spdk_io_channel * +vbdev_lvol_get_io_channel(void *ctx) +{ + struct spdk_lvol *lvol = ctx; + + return spdk_lvol_get_io_channel(lvol); +} + +static bool +vbdev_lvol_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct spdk_lvol *lvol = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return !spdk_blob_is_read_only(lvol->blob); + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_READ: + return true; + default: + return false; + } +} + +static void +lvol_op_comp(void *cb_arg, int bserrno) +{ + struct lvol_task *task = cb_arg; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(task); + + if (bserrno != 0) { + if (bserrno == -ENOMEM) { + task->status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + task->status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev processing callback on device %s with type %d\n", + bdev_io->bdev->name, bdev_io->type); + spdk_bdev_io_complete(bdev_io, task->status); +} + +static void +lvol_unmap(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + struct lvol_task *task = (struct lvol_task *)bdev_io->driver_ctx; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing unmap at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_unmap(blob, ch, start_page, num_pages, lvol_op_comp, task); +} + +static void +lvol_write_zeroes(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + struct lvol_task *task = (struct lvol_task *)bdev_io->driver_ctx; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing write zeros at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_write_zeroes(blob, ch, start_page, num_pages, lvol_op_comp, task); +} + +static void +lvol_read(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_lvol *lvol = bdev_io->bdev->ctxt; + struct spdk_blob *blob = lvol->blob; + struct lvol_task *task = (struct lvol_task *)bdev_io->driver_ctx; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = 
bdev_io->u.bdev.num_blocks; + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing read at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_readv(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page, + num_pages, lvol_op_comp, task); +} + +static void +lvol_write(struct spdk_lvol *lvol, struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint64_t start_page, num_pages; + struct spdk_blob *blob = lvol->blob; + struct lvol_task *task = (struct lvol_task *)bdev_io->driver_ctx; + + start_page = bdev_io->u.bdev.offset_blocks; + num_pages = bdev_io->u.bdev.num_blocks; + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Vbdev doing write at offset %" PRIu64 " using %" PRIu64 " pages on device %s\n", start_page, + num_pages, bdev_io->bdev->name); + spdk_blob_io_writev(blob, ch, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, start_page, + num_pages, lvol_op_comp, task); +} + +static int +lvol_reset(struct spdk_bdev_io *bdev_io) +{ + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + + return 0; +} + +static void +vbdev_lvol_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_lvol *lvol = bdev_io->bdev->ctxt; + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Vbdev request type %d submitted\n", bdev_io->type); + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, lvol_read, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + lvol_write(lvol, ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + lvol_reset(bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + lvol_unmap(lvol, ch, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + lvol_write_zeroes(lvol, ch, bdev_io); + break; + default: + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "lvol: unsupported I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + return; +} + +static struct spdk_bdev_fn_table vbdev_lvol_fn_table = { + .destruct = vbdev_lvol_unregister, + .io_type_supported = vbdev_lvol_io_type_supported, + .submit_request = vbdev_lvol_submit_request, + .get_io_channel = vbdev_lvol_get_io_channel, + .dump_info_json = vbdev_lvol_dump_info_json, + .write_config_json = vbdev_lvol_write_config_json, +}; + +static void +_spdk_lvol_destroy_cb(void *cb_arg, int bdeverrno) +{ +} + +static void +_create_lvol_disk_destroy_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_lvol *lvol = cb_arg; + + if (bdeverrno < 0) { + SPDK_ERRLOG("Could not unregister bdev for lvol %s\n", + lvol->unique_id); + return; + } + + spdk_lvol_destroy(lvol, _spdk_lvol_destroy_cb, NULL); +} + +static void +_create_lvol_disk_unload_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_lvol *lvol = cb_arg; + + if (bdeverrno < 0) { + SPDK_ERRLOG("Could not unregister bdev for lvol %s\n", + lvol->unique_id); + return; + } + + TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link); + free(lvol->unique_id); + free(lvol); +} + +static int +_create_lvol_disk(struct spdk_lvol *lvol, bool destroy) +{ + struct spdk_bdev *bdev; + struct lvol_store_bdev *lvs_bdev; + uint64_t total_size; + unsigned char *alias; + int rc; + + if (!lvol->unique_id) { + return -EINVAL; + } + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvol->lvol_store); + if (lvs_bdev == NULL) { + SPDK_ERRLOG("No spdk lvs-bdev pair found for lvol %s\n", lvol->unique_id); + return -ENODEV; + 
} + + bdev = calloc(1, sizeof(struct spdk_bdev)); + if (!bdev) { + SPDK_ERRLOG("Cannot alloc memory for lvol bdev\n"); + return -ENOMEM; + } + + bdev->name = lvol->unique_id; + bdev->product_name = "Logical Volume"; + bdev->blocklen = spdk_bs_get_io_unit_size(lvol->lvol_store->blobstore); + total_size = spdk_blob_get_num_clusters(lvol->blob) * + spdk_bs_get_cluster_size(lvol->lvol_store->blobstore); + assert((total_size % bdev->blocklen) == 0); + bdev->blockcnt = total_size / bdev->blocklen; + bdev->uuid = lvol->uuid; + bdev->need_aligned_buffer = lvs_bdev->bdev->need_aligned_buffer; + bdev->split_on_optimal_io_boundary = true; + bdev->optimal_io_boundary = spdk_bs_get_cluster_size(lvol->lvol_store->blobstore) / bdev->blocklen; + + bdev->ctxt = lvol; + bdev->fn_table = &vbdev_lvol_fn_table; + bdev->module = &g_lvol_if; + + rc = spdk_vbdev_register(bdev, &lvs_bdev->bdev, 1); + if (rc) { + free(bdev); + return rc; + } + lvol->bdev = bdev; + + alias = spdk_sprintf_alloc("%s/%s", lvs_bdev->lvs->name, lvol->name); + if (alias == NULL) { + SPDK_ERRLOG("Cannot alloc memory for alias\n"); + spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb : + _create_lvol_disk_unload_cb), lvol); + return -ENOMEM; + } + + rc = spdk_bdev_alias_add(bdev, alias); + if (rc != 0) { + SPDK_ERRLOG("Cannot add alias to lvol bdev\n"); + spdk_bdev_unregister(lvol->bdev, (destroy ? _create_lvol_disk_destroy_cb : + _create_lvol_disk_unload_cb), lvol); + } + free(alias); + + return rc; +} + +static void +_vbdev_lvol_create_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + + if (lvolerrno < 0) { + goto end; + } + + lvolerrno = _create_lvol_disk(lvol, true); + +end: + req->cb_fn(req->cb_arg, lvol, lvolerrno); + free(req); +} + +int +vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provision, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + return -ENOMEM; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + rc = spdk_lvol_create(lvs, name, sz, thin_provision, _vbdev_lvol_create_cb, req); + if (rc != 0) { + free(req); + } + + return rc; +} + +void +vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_create_snapshot(lvol, snapshot_name, _vbdev_lvol_create_cb, req); +} + +void +vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_create_clone(lvol, clone_name, _vbdev_lvol_create_cb, req); +} + +static void +_vbdev_lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Renaming lvol failed\n"); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + int rc; + + rc = _vbdev_lvol_change_bdev_alias(lvol, 
new_lvol_name); + if (rc != 0) { + SPDK_ERRLOG("renaming lvol to '%s' does not succeed\n", new_lvol_name); + cb_fn(cb_arg, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_lvol_rename(lvol, new_lvol_name, _vbdev_lvol_rename_cb, req); +} + +static void +_vbdev_lvol_resize_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + uint64_t total_size; + + /* change bdev size */ + if (lvolerrno != 0) { + SPDK_ERRLOG("CB function for bdev lvol %s receive error no: %d.\n", lvol->name, lvolerrno); + goto finish; + } + + total_size = spdk_blob_get_num_clusters(lvol->blob) * + spdk_bs_get_cluster_size(lvol->lvol_store->blobstore); + assert((total_size % lvol->bdev->blocklen) == 0); + + lvolerrno = spdk_bdev_notify_blockcnt_change(lvol->bdev, total_size / lvol->bdev->blocklen); + if (lvolerrno != 0) { + SPDK_ERRLOG("Could not change num blocks for bdev lvol %s with error no: %d.\n", + lvol->name, lvolerrno); + } + +finish: + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + assert(lvol->bdev != NULL); + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->sz = sz; + req->lvol = lvol; + + spdk_lvol_resize(req->lvol, req->sz, _vbdev_lvol_resize_cb, req); +} + +static int +vbdev_lvs_init(void) +{ + return 0; +} + +static int +vbdev_lvs_get_ctx_size(void) +{ + return sizeof(struct lvol_task); +} + +static void +_vbdev_lvs_examine_failed(void *cb_arg, int lvserrno) +{ + spdk_bdev_module_examine_done(&g_lvol_if); +} + +static void +_vbdev_lvol_examine_close_cb(struct spdk_lvol_store *lvs) +{ + if (lvs->lvols_opened >= lvs->lvol_count) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } +} + +static void +_vbdev_lvs_examine_finish(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_lvol_store *lvs = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Error opening lvol %s\n", lvol->unique_id); + TAILQ_REMOVE(&lvs->lvols, lvol, link); + lvs->lvol_count--; + free(lvol->unique_id); + free(lvol); + goto end; + } + + if (_create_lvol_disk(lvol, false)) { + SPDK_ERRLOG("Cannot create bdev for lvol %s\n", lvol->unique_id); + lvs->lvol_count--; + _vbdev_lvol_examine_close_cb(lvs); + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s failed\n", lvol->unique_id); + return; + } + + lvs->lvols_opened++; + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvol %s succeeded\n", lvol->unique_id); + +end: + + if (lvs->lvols_opened >= lvs->lvol_count) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Opening lvols finished\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } +} + +static void +_vbdev_lvs_examine_cb(void *arg, struct spdk_lvol_store *lvol_store, int lvserrno) +{ + struct lvol_store_bdev *lvs_bdev; + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)arg; + struct spdk_lvol *lvol, *tmp; + + if (lvserrno == -EEXIST) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, + "Name for lvolstore on device %s conflicts with name for already loaded lvs\n", + req->base_bdev->name); + /* On error blobstore destroys bs_dev itself */ + 
spdk_bdev_module_examine_done(&g_lvol_if); + goto end; + } else if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store not found on %s\n", req->base_bdev->name); + /* On error blobstore destroys bs_dev itself */ + spdk_bdev_module_examine_done(&g_lvol_if); + goto end; + } + + lvserrno = spdk_bs_bdev_claim(lvol_store->bs_dev, &g_lvol_if); + if (lvserrno != 0) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store base bdev already claimed by another bdev\n"); + spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL); + goto end; + } + + lvs_bdev = calloc(1, sizeof(*lvs_bdev)); + if (!lvs_bdev) { + SPDK_ERRLOG("Cannot alloc memory for lvs_bdev\n"); + spdk_lvs_unload(lvol_store, _vbdev_lvs_examine_failed, NULL); + goto end; + } + + lvs_bdev->lvs = lvol_store; + lvs_bdev->bdev = req->base_bdev; + + TAILQ_INSERT_TAIL(&g_spdk_lvol_pairs, lvs_bdev, lvol_stores); + + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store found on %s - begin parsing\n", + req->base_bdev->name); + + lvol_store->lvols_opened = 0; + + if (TAILQ_EMPTY(&lvol_store->lvols)) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Lvol store examination done\n"); + spdk_bdev_module_examine_done(&g_lvol_if); + } else { + /* Open all lvols */ + TAILQ_FOREACH_SAFE(lvol, &lvol_store->lvols, link, tmp) { + spdk_lvol_open(lvol, _vbdev_lvs_examine_finish, lvol_store); + } + } + +end: + free(req); +} + +static void +vbdev_lvs_examine(struct spdk_bdev *bdev) +{ + struct spdk_bs_dev *bs_dev; + struct spdk_lvs_with_handle_req *req; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + spdk_bdev_module_examine_done(&g_lvol_if); + SPDK_ERRLOG("Cannot alloc memory for vbdev lvol store request pointer\n"); + return; + } + + bs_dev = spdk_bdev_create_bs_dev(bdev, vbdev_lvs_hotremove_cb, bdev); + if (!bs_dev) { + SPDK_INFOLOG(SPDK_LOG_VBDEV_LVOL, "Cannot create bs dev on %s\n", bdev->name); + spdk_bdev_module_examine_done(&g_lvol_if); + free(req); + return; + } + + req->base_bdev = bdev; + + spdk_lvs_load(bs_dev, _vbdev_lvs_examine_cb, req); +} + +struct spdk_lvol * +vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev) +{ + if (!bdev || bdev->module != &g_lvol_if) { + return NULL; + } + + if (bdev->ctxt == NULL) { + SPDK_ERRLOG("No lvol ctx assigned to bdev %s\n", bdev->name); + return NULL; + } + + return (struct spdk_lvol *)bdev->ctxt; +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_lvol", SPDK_LOG_VBDEV_LVOL); diff --git a/src/spdk/lib/bdev/lvol/vbdev_lvol.h b/src/spdk/lib/bdev/lvol/vbdev_lvol.h new file mode 100644 index 00000000..93991d08 --- /dev/null +++ b/src/spdk/lib/bdev/lvol/vbdev_lvol.h @@ -0,0 +1,120 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_LVOL_H +#define SPDK_VBDEV_LVOL_H + +#include "spdk/lvol.h" +#include "spdk/bdev_module.h" + +#include "spdk_internal/lvolstore.h" + +struct lvol_store_bdev { + struct spdk_lvol_store *lvs; + struct spdk_bdev *bdev; + struct spdk_lvs_req *req; + + TAILQ_ENTRY(lvol_store_bdev) lvol_stores; +}; + +int vbdev_lvs_create(struct spdk_bdev *base_bdev, const char *name, uint32_t cluster_sz, + spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg); +void vbdev_lvs_destruct(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg); +void vbdev_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, void *cb_arg); + +int vbdev_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provisioned, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg); + +void vbdev_lvol_create_snapshot(struct spdk_lvol *lvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg); + +void vbdev_lvol_create_clone(struct spdk_lvol *lvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg); + +/** + * \brief Change size of lvol + * \param lvol Handle to lvol + * \param sz New size of lvol + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, spdk_lvol_op_complete cb_fn, + void *cb_arg); + +void vbdev_lvol_rename(struct spdk_lvol *lvol, const char *new_lvol_name, + spdk_lvol_op_complete cb_fn, void *cb_arg); + +/** + * Destroy a logical volume + * \param lvol Handle to lvol + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg); + +/** + * \brief Renames given lvolstore. + * + * \param lvs Pointer to lvolstore + * \param new_lvs_name New name of lvs + * \param cb_fn Completion callback + * \param cb_arg Completion callback custom arguments + */ +void vbdev_lvs_rename(struct spdk_lvol_store *lvs, const char *new_lvs_name, + spdk_lvs_op_complete cb_fn, void *cb_arg); + +/** + * \brief Search for handle to lvolstore + * \param uuid_str UUID of lvolstore + * \return Handle to spdk_lvol_store or NULL if not found. + */ +struct spdk_lvol_store *vbdev_get_lvol_store_by_uuid(const char *uuid_str); + +/** + * \brief Search for handle to lvolstore + * \param name Name of lvolstore + * \return Handle to spdk_lvol_store or NULL if not found. 
+ */ +struct spdk_lvol_store *vbdev_get_lvol_store_by_name(const char *name); + +/** + * \brief Search for handle to lvol_store_bdev + * \param lvs handle to lvolstore + * \return Handle to lvol_store_bdev or NULL if not found. + */ +struct lvol_store_bdev *vbdev_get_lvs_bdev_by_lvs(struct spdk_lvol_store *lvs); + +struct spdk_lvol *vbdev_lvol_get_from_bdev(struct spdk_bdev *bdev); + +#endif /* SPDK_VBDEV_LVOL_H */ diff --git a/src/spdk/lib/bdev/lvol/vbdev_lvol_rpc.c b/src/spdk/lib/bdev/lvol/vbdev_lvol_rpc.c new file mode 100644 index 00000000..30f67f35 --- /dev/null +++ b/src/spdk/lib/bdev/lvol/vbdev_lvol_rpc.c @@ -0,0 +1,1089 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/bdev.h" +#include "spdk/util.h" +#include "vbdev_lvol.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +SPDK_LOG_REGISTER_COMPONENT("lvolrpc", SPDK_LOG_LVOL_RPC) + +struct rpc_construct_lvol_store { + char *lvs_name; + char *bdev_name; + uint32_t cluster_sz; +}; + +static int +vbdev_get_lvol_store_by_uuid_xor_name(const char *uuid, const char *lvs_name, + struct spdk_lvol_store **lvs) +{ + if ((uuid == NULL && lvs_name == NULL)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "lvs UUID nor lvs name specified\n"); + return -EINVAL; + } else if ((uuid && lvs_name)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "both lvs UUID '%s' and lvs name '%s' specified\n", uuid, + lvs_name); + return -EINVAL; + } else if (uuid) { + *lvs = vbdev_get_lvol_store_by_uuid(uuid); + + if (*lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with UUID '%s' not found\n", uuid); + return -ENODEV; + } + } else if (lvs_name) { + + *lvs = vbdev_get_lvol_store_by_name(lvs_name); + + if (*lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "blobstore with name '%s' not found\n", lvs_name); + return -ENODEV; + } + } + return 0; +} + +static void +free_rpc_construct_lvol_store(struct rpc_construct_lvol_store *req) +{ + free(req->bdev_name); + free(req->lvs_name); +} + +static const struct spdk_json_object_decoder rpc_construct_lvol_store_decoders[] = { + {"bdev_name", offsetof(struct rpc_construct_lvol_store, bdev_name), spdk_json_decode_string}, + {"cluster_sz", offsetof(struct rpc_construct_lvol_store, cluster_sz), spdk_json_decode_uint32, true}, + {"lvs_name", offsetof(struct rpc_construct_lvol_store, lvs_name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_lvol_store_construct_cb(void *cb_arg, struct spdk_lvol_store *lvol_store, int lvserrno) +{ + struct spdk_json_write_ctx *w; + char lvol_store_uuid[SPDK_UUID_STRING_LEN]; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + spdk_uuid_fmt_lower(lvol_store_uuid, sizeof(lvol_store_uuid), &lvol_store->uuid); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, lvol_store_uuid); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +spdk_rpc_construct_lvol_store(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_lvol_store req = {}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_lvol_store_decoders, + SPDK_COUNTOF(rpc_construct_lvol_store_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + if (req.bdev_name == NULL) { + SPDK_ERRLOG("missing bdev_name param\n"); + rc = -EINVAL; + goto invalid; + } + + if (req.lvs_name == NULL) { + SPDK_ERRLOG("missing lvs_name param\n"); + rc = -EINVAL; + goto invalid; + } + bdev = spdk_bdev_get_by_name(req.bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.bdev_name); + rc = -ENODEV; + goto invalid; + } + + rc = vbdev_lvs_create(bdev, req.lvs_name, req.cluster_sz, _spdk_rpc_lvol_store_construct_cb, + request); + if (rc < 0) { + goto invalid; + } + free_rpc_construct_lvol_store(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_construct_lvol_store(&req); +} 
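+/*
+ * Illustrative usage sketch: based on the rpc_construct_lvol_store_decoders and
+ * the completion callback above, a construct_lvol_store request is expected to
+ * look roughly like the JSON-RPC call below. The base bdev name "Malloc0", the
+ * lvs name "lvs0" and the 4 MiB cluster_sz are hypothetical example values
+ * (cluster_sz is optional); on success the handler replies with the UUID string
+ * of the newly created lvol store.
+ *
+ *   {
+ *     "jsonrpc": "2.0",
+ *     "id": 1,
+ *     "method": "construct_lvol_store",
+ *     "params": { "bdev_name": "Malloc0", "lvs_name": "lvs0", "cluster_sz": 4194304 }
+ *   }
+ */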
+SPDK_RPC_REGISTER("construct_lvol_store", spdk_rpc_construct_lvol_store, SPDK_RPC_RUNTIME) + +struct rpc_rename_lvol_store { + char *old_name; + char *new_name; +}; + +static void +free_rpc_rename_lvol_store(struct rpc_rename_lvol_store *req) +{ + free(req->old_name); + free(req->new_name); +} + +static const struct spdk_json_object_decoder rpc_rename_lvol_store_decoders[] = { + {"old_name", offsetof(struct rpc_rename_lvol_store, old_name), spdk_json_decode_string}, + {"new_name", offsetof(struct rpc_rename_lvol_store, new_name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_rename_lvol_store_cb(void *cb_arg, int lvserrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +spdk_rpc_rename_lvol_store(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_rename_lvol_store req = {}; + struct spdk_lvol_store *lvs; + int rc; + + if (spdk_json_decode_object(params, rpc_rename_lvol_store_decoders, + SPDK_COUNTOF(rpc_rename_lvol_store_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + lvs = vbdev_get_lvol_store_by_name(req.old_name); + if (lvs == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "no lvs existing for given name\n"); + rc = -ENOENT; + goto invalid; + } + + vbdev_lvs_rename(lvs, req.new_name, _spdk_rpc_rename_lvol_store_cb, request); + + free_rpc_rename_lvol_store(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_rename_lvol_store(&req); +} +SPDK_RPC_REGISTER("rename_lvol_store", spdk_rpc_rename_lvol_store, SPDK_RPC_RUNTIME) + +struct rpc_destroy_lvol_store { + char *uuid; + char *lvs_name; +}; + +static void +free_rpc_destroy_lvol_store(struct rpc_destroy_lvol_store *req) +{ + free(req->uuid); + free(req->lvs_name); +} + +static const struct spdk_json_object_decoder rpc_destroy_lvol_store_decoders[] = { + {"uuid", offsetof(struct rpc_destroy_lvol_store, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_destroy_lvol_store, lvs_name), spdk_json_decode_string, true}, +}; + +static void +_spdk_rpc_lvol_store_destroy_cb(void *cb_arg, int lvserrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvserrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvserrno)); +} + +static void +spdk_rpc_destroy_lvol_store(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_destroy_lvol_store req = {}; + struct spdk_lvol_store *lvs = NULL; + int rc; + + if (spdk_json_decode_object(params, rpc_destroy_lvol_store_decoders, + SPDK_COUNTOF(rpc_destroy_lvol_store_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs); + if 
(rc != 0) { + goto invalid; + } + + vbdev_lvs_destruct(lvs, _spdk_rpc_lvol_store_destroy_cb, request); + + free_rpc_destroy_lvol_store(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_destroy_lvol_store(&req); +} +SPDK_RPC_REGISTER("destroy_lvol_store", spdk_rpc_destroy_lvol_store, SPDK_RPC_RUNTIME) + +struct rpc_construct_lvol_bdev { + char *uuid; + char *lvs_name; + char *lvol_name; + uint64_t size; + bool thin_provision; +}; + +static void +free_rpc_construct_lvol_bdev(struct rpc_construct_lvol_bdev *req) +{ + free(req->uuid); + free(req->lvs_name); + free(req->lvol_name); +} + +static const struct spdk_json_object_decoder rpc_construct_lvol_bdev_decoders[] = { + {"uuid", offsetof(struct rpc_construct_lvol_bdev, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_construct_lvol_bdev, lvs_name), spdk_json_decode_string, true}, + {"lvol_name", offsetof(struct rpc_construct_lvol_bdev, lvol_name), spdk_json_decode_string, true}, + {"size", offsetof(struct rpc_construct_lvol_bdev, size), spdk_json_decode_uint64}, + {"thin_provision", offsetof(struct rpc_construct_lvol_bdev, thin_provision), spdk_json_decode_bool, true}, +}; + +static void +_spdk_rpc_construct_lvol_bdev_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_construct_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_lvol_bdev req = {}; + int rc; + struct spdk_lvol_store *lvs = NULL; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Creating blob\n"); + + if (spdk_json_decode_object(params, rpc_construct_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_construct_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs); + if (rc != 0) { + goto invalid; + } + + if (req.lvol_name == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "no bdev name\n"); + rc = -EINVAL; + goto invalid; + } + + rc = vbdev_lvol_create(lvs, req.lvol_name, req.size, req.thin_provision, + _spdk_rpc_construct_lvol_bdev_cb, request); + if (rc < 0) { + goto invalid; + } + + free_rpc_construct_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_construct_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("construct_lvol_bdev", spdk_rpc_construct_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_snapshot_lvol_bdev { + char *lvol_name; + char *snapshot_name; +}; + +static void +free_rpc_snapshot_lvol_bdev(struct rpc_snapshot_lvol_bdev *req) +{ + free(req->lvol_name); + free(req->snapshot_name); +} + +static const struct spdk_json_object_decoder rpc_snapshot_lvol_bdev_decoders[] = { + {"lvol_name", offsetof(struct rpc_snapshot_lvol_bdev, lvol_name), spdk_json_decode_string}, + {"snapshot_name", offsetof(struct rpc_snapshot_lvol_bdev, snapshot_name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_snapshot_lvol_bdev_cb(void 
*cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_snapshot_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_snapshot_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Snapshotting blob\n"); + + if (spdk_json_decode_object(params, rpc_snapshot_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_snapshot_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.lvol_name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.lvol_name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + rc = -ENODEV; + goto invalid; + } + + vbdev_lvol_create_snapshot(lvol, req.snapshot_name, _spdk_rpc_snapshot_lvol_bdev_cb, request); + + free_rpc_snapshot_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_snapshot_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("snapshot_lvol_bdev", spdk_rpc_snapshot_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_clone_lvol_bdev { + char *snapshot_name; + char *clone_name; +}; + +static void +free_rpc_clone_lvol_bdev(struct rpc_clone_lvol_bdev *req) +{ + free(req->snapshot_name); + free(req->clone_name); +} + +static const struct spdk_json_object_decoder rpc_clone_lvol_bdev_decoders[] = { + {"snapshot_name", offsetof(struct rpc_clone_lvol_bdev, snapshot_name), spdk_json_decode_string}, + {"clone_name", offsetof(struct rpc_clone_lvol_bdev, clone_name), spdk_json_decode_string, true}, +}; + +static void +_spdk_rpc_clone_lvol_bdev_cb(void *cb_arg, struct spdk_lvol *lvol, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, lvol->unique_id); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_clone_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_clone_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Cloning blob\n"); + + if (spdk_json_decode_object(params, rpc_clone_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_clone_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.snapshot_name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "bdev '%s' does not exist\n", req.snapshot_name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not 
exist\n"); + rc = -ENODEV; + goto invalid; + } + + vbdev_lvol_create_clone(lvol, req.clone_name, _spdk_rpc_clone_lvol_bdev_cb, request); + + free_rpc_clone_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_clone_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("clone_lvol_bdev", spdk_rpc_clone_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_rename_lvol_bdev { + char *old_name; + char *new_name; +}; + +static void +free_rpc_rename_lvol_bdev(struct rpc_rename_lvol_bdev *req) +{ + free(req->old_name); + free(req->new_name); +} + +static const struct spdk_json_object_decoder rpc_rename_lvol_bdev_decoders[] = { + {"old_name", offsetof(struct rpc_rename_lvol_bdev, old_name), spdk_json_decode_string}, + {"new_name", offsetof(struct rpc_rename_lvol_bdev, new_name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_rename_lvol_bdev_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_rename_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_rename_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc = 0; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Renaming lvol\n"); + + if (spdk_json_decode_object(params, rpc_rename_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_rename_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.old_name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.old_name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + rc = -ENODEV; + goto invalid; + } + + vbdev_lvol_rename(lvol, req.new_name, _spdk_rpc_rename_lvol_bdev_cb, request); + + free_rpc_rename_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_rename_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("rename_lvol_bdev", spdk_rpc_rename_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_inflate_lvol_bdev { + char *name; +}; + +static void +free_rpc_inflate_lvol_bdev(struct rpc_inflate_lvol_bdev *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_inflate_lvol_bdev_decoders[] = { + {"name", offsetof(struct rpc_inflate_lvol_bdev, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_inflate_lvol_bdev_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_inflate_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_inflate_lvol_bdev req = {}; + 
struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc = 0; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Inflating lvol\n"); + + if (spdk_json_decode_object(params, rpc_inflate_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_inflate_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + rc = -ENODEV; + goto invalid; + } + + spdk_lvol_inflate(lvol, _spdk_rpc_inflate_lvol_bdev_cb, request); + + free_rpc_inflate_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_inflate_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("inflate_lvol_bdev", spdk_rpc_inflate_lvol_bdev, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_decouple_parent_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_inflate_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc = 0; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Decoupling parent of lvol\n"); + + if (spdk_json_decode_object(params, rpc_inflate_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_inflate_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + rc = -ENODEV; + goto invalid; + } + + spdk_lvol_decouple_parent(lvol, _spdk_rpc_inflate_lvol_bdev_cb, request); + + free_rpc_inflate_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + free_rpc_inflate_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("decouple_parent_lvol_bdev", spdk_rpc_decouple_parent_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_resize_lvol_bdev { + char *name; + uint64_t size; +}; + +static void +free_rpc_resize_lvol_bdev(struct rpc_resize_lvol_bdev *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_resize_lvol_bdev_decoders[] = { + {"name", offsetof(struct rpc_resize_lvol_bdev, name), spdk_json_decode_string}, + {"size", offsetof(struct rpc_resize_lvol_bdev, size), spdk_json_decode_uint64}, +}; + +static void +_spdk_rpc_resize_lvol_bdev_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_resize_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_resize_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc = 0; + + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "Resizing lvol\n"); + + if (spdk_json_decode_object(params, rpc_resize_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_resize_lvol_bdev_decoders), + 
&req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + if (req.name == NULL) { + SPDK_ERRLOG("missing name param\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev for provided name %s\n", req.name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + rc = -ENODEV; + goto invalid; + } + + vbdev_lvol_resize(lvol, req.size, _spdk_rpc_resize_lvol_bdev_cb, request); + + free_rpc_resize_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_resize_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("resize_lvol_bdev", spdk_rpc_resize_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_destroy_lvol_bdev { + char *name; +}; + +static void +free_rpc_destroy_lvol_bdev(struct rpc_destroy_lvol_bdev *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_destroy_lvol_bdev_decoders[] = { + {"name", offsetof(struct rpc_destroy_lvol_bdev, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_destroy_lvol_bdev_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_json_write_ctx *w; + struct spdk_jsonrpc_request *request = cb_arg; + + if (lvolerrno != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-lvolerrno)); +} + +static void +spdk_rpc_destroy_lvol_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_destroy_lvol_bdev req = {}; + struct spdk_bdev *bdev; + struct spdk_lvol *lvol; + int rc; + + if (spdk_json_decode_object(params, rpc_destroy_lvol_bdev_decoders, + SPDK_COUNTOF(rpc_destroy_lvol_bdev_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev for provided name %s\n", req.name); + rc = -ENODEV; + goto invalid; + } + + lvol = vbdev_lvol_get_from_bdev(bdev); + if (lvol == NULL) { + rc = -ENODEV; + goto invalid; + } + + vbdev_lvol_destroy(lvol, _spdk_rpc_destroy_lvol_bdev_cb, request); + + free_rpc_destroy_lvol_bdev(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_destroy_lvol_bdev(&req); +} + +SPDK_RPC_REGISTER("destroy_lvol_bdev", spdk_rpc_destroy_lvol_bdev, SPDK_RPC_RUNTIME) + +struct rpc_get_lvol_stores { + char *uuid; + char *lvs_name; +}; + +static void +free_rpc_get_lvol_stores(struct rpc_get_lvol_stores *req) +{ + free(req->uuid); + free(req->lvs_name); +} + +static const struct spdk_json_object_decoder rpc_get_lvol_stores_decoders[] = { + {"uuid", offsetof(struct rpc_get_lvol_stores, uuid), spdk_json_decode_string, true}, + {"lvs_name", offsetof(struct rpc_get_lvol_stores, lvs_name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_dump_lvol_store_info(struct spdk_json_write_ctx *w, struct lvol_store_bdev *lvs_bdev) +{ + struct spdk_blob_store *bs; + uint64_t cluster_size, block_size; + char uuid[SPDK_UUID_STRING_LEN]; + + bs = lvs_bdev->lvs->blobstore; + cluster_size = spdk_bs_get_cluster_size(bs); + /* Block size of lvols is always size of blob 
store page */ + block_size = spdk_bs_get_page_size(bs); + + spdk_json_write_object_begin(w); + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvs_bdev->lvs->uuid); + spdk_json_write_name(w, "uuid"); + spdk_json_write_string(w, uuid); + + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, lvs_bdev->lvs->name); + + spdk_json_write_name(w, "base_bdev"); + spdk_json_write_string(w, spdk_bdev_get_name(lvs_bdev->bdev)); + + spdk_json_write_name(w, "total_data_clusters"); + spdk_json_write_uint64(w, spdk_bs_total_data_cluster_count(bs)); + + spdk_json_write_name(w, "free_clusters"); + spdk_json_write_uint64(w, spdk_bs_free_cluster_count(bs)); + + spdk_json_write_name(w, "block_size"); + spdk_json_write_uint64(w, block_size); + + spdk_json_write_name(w, "cluster_size"); + spdk_json_write_uint64(w, cluster_size); + + spdk_json_write_object_end(w); +} + +static void +spdk_rpc_get_lvol_stores(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_lvol_stores req = {}; + struct spdk_json_write_ctx *w; + struct lvol_store_bdev *lvs_bdev = NULL; + struct spdk_lvol_store *lvs = NULL; + int rc; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_lvol_stores_decoders, + SPDK_COUNTOF(rpc_get_lvol_stores_decoders), + &req)) { + SPDK_INFOLOG(SPDK_LOG_LVOL_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = vbdev_get_lvol_store_by_uuid_xor_name(req.uuid, req.lvs_name, &lvs); + if (rc != 0) { + goto invalid; + } + + lvs_bdev = vbdev_get_lvs_bdev_by_lvs(lvs); + if (lvs_bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_get_lvol_stores(&req); + return; + } + + spdk_json_write_array_begin(w); + + if (lvs_bdev != NULL) { + spdk_rpc_dump_lvol_store_info(w, lvs_bdev); + } else { + for (lvs_bdev = vbdev_lvol_store_first(); lvs_bdev != NULL; + lvs_bdev = vbdev_lvol_store_next(lvs_bdev)) { + spdk_rpc_dump_lvol_store_info(w, lvs_bdev); + } + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + free_rpc_get_lvol_stores(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_get_lvol_stores(&req); +} + +SPDK_RPC_REGISTER("get_lvol_stores", spdk_rpc_get_lvol_stores, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/malloc/Makefile b/src/spdk/lib/bdev/malloc/Makefile new file mode 100644 index 00000000..f4eb9aaa --- /dev/null +++ b/src/spdk/lib/bdev/malloc/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_malloc.c bdev_malloc_rpc.c +LIBNAME = bdev_malloc +LOCAL_SYS_LIBS = -luuid + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/malloc/bdev_malloc.c b/src/spdk/lib/bdev/malloc/bdev_malloc.c new file mode 100644 index 00000000..eb4b2b9c --- /dev/null +++ b/src/spdk/lib/bdev/malloc/bdev_malloc.c @@ -0,0 +1,524 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "bdev_malloc.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/copy_engine.h" +#include "spdk/json.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/string.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +struct malloc_disk { + struct spdk_bdev disk; + void *malloc_buf; + TAILQ_ENTRY(malloc_disk) link; +}; + +struct malloc_task { + int num_outstanding; + enum spdk_bdev_io_status status; +}; + +static struct malloc_task * +__malloc_task_from_copy_task(struct spdk_copy_task *ct) +{ + return (struct malloc_task *)((uintptr_t)ct - sizeof(struct malloc_task)); +} + +static struct spdk_copy_task * +__copy_task_from_malloc_task(struct malloc_task *mt) +{ + return (struct spdk_copy_task *)((uintptr_t)mt + sizeof(struct malloc_task)); +} + +static void +malloc_done(void *ref, int status) +{ + struct malloc_task *task = __malloc_task_from_copy_task(ref); + + if (status != 0) { + if (status == -ENOMEM) { + task->status = SPDK_BDEV_IO_STATUS_NOMEM; + } else { + task->status = SPDK_BDEV_IO_STATUS_FAILED; + } + } + + if (--task->num_outstanding == 0) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), task->status); + } +} + +static TAILQ_HEAD(, malloc_disk) g_malloc_disks = TAILQ_HEAD_INITIALIZER(g_malloc_disks); + +int malloc_disk_count = 0; + +static int bdev_malloc_initialize(void); +static void bdev_malloc_get_spdk_running_config(FILE *fp); + +static int +bdev_malloc_get_ctx_size(void) +{ + return sizeof(struct malloc_task) + spdk_copy_task_size(); +} + +static struct spdk_bdev_module malloc_if = { + .name = "malloc", + .module_init = bdev_malloc_initialize, + .config_text = bdev_malloc_get_spdk_running_config, + .get_ctx_size = bdev_malloc_get_ctx_size, + +}; + +SPDK_BDEV_MODULE_REGISTER(&malloc_if) + +static void +malloc_disk_free(struct malloc_disk *malloc_disk) +{ + if (!malloc_disk) { + return; + } + + free(malloc_disk->disk.name); + spdk_dma_free(malloc_disk->malloc_buf); + spdk_dma_free(malloc_disk); +} + +static int +bdev_malloc_destruct(void *ctx) +{ + struct malloc_disk *malloc_disk = ctx; + + TAILQ_REMOVE(&g_malloc_disks, malloc_disk, link); + malloc_disk_free(malloc_disk); + return 0; +} + +static int +bdev_malloc_check_iov_len(struct iovec *iovs, int iovcnt, size_t nbytes) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + if (nbytes < iovs[i].iov_len) { + return 0; + } + + nbytes -= iovs[i].iov_len; + } + + return nbytes != 0; +} + +static void +bdev_malloc_readv(struct malloc_disk *mdisk, struct spdk_io_channel *ch, + struct malloc_task *task, + struct iovec *iov, int iovcnt, size_t len, uint64_t offset) +{ + int64_t res = 0; + void *src = mdisk->malloc_buf + offset; + int i; + + if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), + SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "read %lu bytes from offset %#lx\n", + len, offset); + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = iovcnt; + + for (i = 0; i < iovcnt; i++) { + res = spdk_copy_submit(__copy_task_from_malloc_task(task), + ch, iov[i].iov_base, + src, iov[i].iov_len, malloc_done); + + if (res != 0) { + malloc_done(__copy_task_from_malloc_task(task), res); + } + + src += iov[i].iov_len; + len -= iov[i].iov_len; + } +} + +static void +bdev_malloc_writev(struct malloc_disk *mdisk, struct spdk_io_channel *ch, + struct malloc_task *task, + struct iovec *iov, int 
iovcnt, size_t len, uint64_t offset) +{ + int64_t res = 0; + void *dst = mdisk->malloc_buf + offset; + int i; + + if (bdev_malloc_check_iov_len(iov, iovcnt, len)) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), + SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "wrote %lu bytes to offset %#lx\n", + len, offset); + + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = iovcnt; + + for (i = 0; i < iovcnt; i++) { + res = spdk_copy_submit(__copy_task_from_malloc_task(task), + ch, dst, iov[i].iov_base, + iov[i].iov_len, malloc_done); + + if (res != 0) { + malloc_done(__copy_task_from_malloc_task(task), res); + } + + dst += iov[i].iov_len; + } +} + +static int +bdev_malloc_unmap(struct malloc_disk *mdisk, + struct spdk_io_channel *ch, + struct malloc_task *task, + uint64_t offset, + uint64_t byte_count) +{ + task->status = SPDK_BDEV_IO_STATUS_SUCCESS; + task->num_outstanding = 1; + + return spdk_copy_submit_fill(__copy_task_from_malloc_task(task), ch, + mdisk->malloc_buf + offset, 0, byte_count, malloc_done); +} + +static int64_t +bdev_malloc_flush(struct malloc_disk *mdisk, struct malloc_task *task, + uint64_t offset, uint64_t nbytes) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static int +bdev_malloc_reset(struct malloc_disk *mdisk, struct malloc_task *task) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(task), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static int _bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + uint32_t block_size = bdev_io->bdev->blocklen; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { + assert(bdev_io->u.bdev.iovcnt == 1); + bdev_io->u.bdev.iovs[0].iov_base = + ((struct malloc_disk *)bdev_io->bdev->ctxt)->malloc_buf + + bdev_io->u.bdev.offset_blocks * block_size; + bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * block_size; + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bdev_io->driver_ctx), + SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + } + + bdev_malloc_readv((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * block_size, + bdev_io->u.bdev.offset_blocks * block_size); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_malloc_writev((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * block_size, + bdev_io->u.bdev.offset_blocks * block_size); + return 0; + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_malloc_reset((struct malloc_disk *)bdev_io->bdev->ctxt, + (struct malloc_task *)bdev_io->driver_ctx); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_malloc_flush((struct malloc_disk *)bdev_io->bdev->ctxt, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + case SPDK_BDEV_IO_TYPE_UNMAP: + return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + /* bdev_malloc_unmap is implemented with a call to mem_cpy_fill which zeroes out all of the requested bytes. 
*/ + return bdev_malloc_unmap((struct malloc_disk *)bdev_io->bdev->ctxt, + ch, + (struct malloc_task *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks * block_size, + bdev_io->u.bdev.num_blocks * block_size); + + default: + return -1; + } + return 0; +} + +static void bdev_malloc_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_malloc_submit_request(ch, bdev_io) != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_malloc_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return true; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_malloc_get_io_channel(void *ctx) +{ + return spdk_copy_engine_get_io_channel(); +} + +static void +bdev_malloc_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_malloc_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table malloc_fn_table = { + .destruct = bdev_malloc_destruct, + .submit_request = bdev_malloc_submit_request, + .io_type_supported = bdev_malloc_io_type_supported, + .get_io_channel = bdev_malloc_get_io_channel, + .write_config_json = bdev_malloc_write_json_config, +}; + +struct spdk_bdev *create_malloc_disk(const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size) +{ + struct malloc_disk *mdisk; + int rc; + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk must be more than 0 blocks\n"); + return NULL; + } + + mdisk = spdk_dma_zmalloc(sizeof(*mdisk), 0, NULL); + if (!mdisk) { + SPDK_ERRLOG("mdisk spdk_dma_zmalloc() failed\n"); + return NULL; + } + + /* + * Allocate the large backend memory buffer from pinned memory. + * + * TODO: need to pass a hint so we know which socket to allocate + * from on multi-socket systems. 
+ */ + mdisk->malloc_buf = spdk_dma_zmalloc(num_blocks * block_size, 2 * 1024 * 1024, NULL); + if (!mdisk->malloc_buf) { + SPDK_ERRLOG("malloc_buf spdk_dma_zmalloc() failed\n"); + malloc_disk_free(mdisk); + return NULL; + } + + if (name) { + mdisk->disk.name = strdup(name); + } else { + /* Auto-generate a name */ + mdisk->disk.name = spdk_sprintf_alloc("Malloc%d", malloc_disk_count); + malloc_disk_count++; + } + if (!mdisk->disk.name) { + malloc_disk_free(mdisk); + return NULL; + } + mdisk->disk.product_name = "Malloc disk"; + + mdisk->disk.write_cache = 1; + mdisk->disk.blocklen = block_size; + mdisk->disk.blockcnt = num_blocks; + if (uuid) { + mdisk->disk.uuid = *uuid; + } else { + spdk_uuid_generate(&mdisk->disk.uuid); + } + + mdisk->disk.ctxt = mdisk; + mdisk->disk.fn_table = &malloc_fn_table; + mdisk->disk.module = &malloc_if; + + rc = spdk_bdev_register(&mdisk->disk); + if (rc) { + malloc_disk_free(mdisk); + return NULL; + } + + TAILQ_INSERT_TAIL(&g_malloc_disks, mdisk, link); + + return &mdisk->disk; +} + +void +delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &malloc_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int bdev_malloc_initialize(void) +{ + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Malloc"); + int NumberOfLuns, LunSizeInMB, BlockSize, i, rc = 0; + uint64_t size; + struct spdk_bdev *bdev; + + if (sp != NULL) { + NumberOfLuns = spdk_conf_section_get_intval(sp, "NumberOfLuns"); + LunSizeInMB = spdk_conf_section_get_intval(sp, "LunSizeInMB"); + BlockSize = spdk_conf_section_get_intval(sp, "BlockSize"); + if ((NumberOfLuns < 1) || (LunSizeInMB < 1)) { + SPDK_ERRLOG("Malloc section present, but no devices specified\n"); + goto end; + } + if (BlockSize < 1) { + /* Default is 512 bytes */ + BlockSize = 512; + } + size = (uint64_t)LunSizeInMB * 1024 * 1024; + for (i = 0; i < NumberOfLuns; i++) { + bdev = create_malloc_disk(NULL, NULL, size / BlockSize, BlockSize); + if (bdev == NULL) { + SPDK_ERRLOG("Could not create malloc disk\n"); + rc = EINVAL; + goto end; + } + } + } + +end: + return rc; +} + +static void +bdev_malloc_get_spdk_running_config(FILE *fp) +{ + int num_malloc_luns = 0; + uint64_t malloc_lun_size = 0; + struct malloc_disk *mdisk; + + /* count number of malloc LUNs, get LUN size */ + TAILQ_FOREACH(mdisk, &g_malloc_disks, link) { + if (0 == malloc_lun_size) { + /* assume all malloc luns the same size */ + malloc_lun_size = mdisk->disk.blocklen * mdisk->disk.blockcnt; + malloc_lun_size /= (1024 * 1024); + } + num_malloc_luns++; + } + + if (num_malloc_luns > 0) { + fprintf(fp, + "\n" + "# Users may change this section to create a different number or size of\n" + "# malloc LUNs.\n" + "# This will generate %d LUNs with a malloc-allocated backend. Each LUN\n" + "# will be %" PRIu64 "MB in size and these will be named Malloc0 through Malloc%d.\n" + "# Not all LUNs defined here are necessarily used below.\n" + "[Malloc]\n" + " NumberOfLuns %d\n" + " LunSizeInMB %" PRIu64 "\n", + num_malloc_luns, malloc_lun_size, + num_malloc_luns - 1, num_malloc_luns, + malloc_lun_size); + } +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_malloc", SPDK_LOG_BDEV_MALLOC) diff --git a/src/spdk/lib/bdev/malloc/bdev_malloc.h b/src/spdk/lib/bdev/malloc/bdev_malloc.h new file mode 100644 index 00000000..8ebdba78 --- /dev/null +++ b/src/spdk/lib/bdev/malloc/bdev_malloc.h @@ -0,0 +1,48 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_MALLOC_H +#define SPDK_BDEV_MALLOC_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_malloc_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev *create_malloc_disk(const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size); + +void delete_malloc_disk(struct spdk_bdev *bdev, spdk_delete_malloc_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BDEV_MALLOC_H */ diff --git a/src/spdk/lib/bdev/malloc/bdev_malloc_rpc.c b/src/spdk/lib/bdev/malloc/bdev_malloc_rpc.c new file mode 100644 index 00000000..4066cf2f --- /dev/null +++ b/src/spdk/lib/bdev/malloc/bdev_malloc_rpc.c @@ -0,0 +1,170 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_malloc.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/uuid.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_construct_malloc { + char *name; + char *uuid; + uint64_t num_blocks; + uint32_t block_size; +}; + +static void +free_rpc_construct_malloc(struct rpc_construct_malloc *r) +{ + free(r->name); + free(r->uuid); +} + +static const struct spdk_json_object_decoder rpc_construct_malloc_decoders[] = { + {"name", offsetof(struct rpc_construct_malloc, name), spdk_json_decode_string, true}, + {"uuid", offsetof(struct rpc_construct_malloc, uuid), spdk_json_decode_string, true}, + {"num_blocks", offsetof(struct rpc_construct_malloc, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_construct_malloc, block_size), spdk_json_decode_uint32}, +}; + +static void +spdk_rpc_construct_malloc_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_malloc req = {NULL}; + struct spdk_json_write_ctx *w; + struct spdk_uuid *uuid = NULL; + struct spdk_uuid decoded_uuid; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_construct_malloc_decoders, + SPDK_COUNTOF(rpc_construct_malloc_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.uuid) { + if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { + goto invalid; + } + uuid = &decoded_uuid; + } + + bdev = create_malloc_disk(req.name, uuid, req.num_blocks, req.block_size); + if (bdev == NULL) { + goto invalid; + } + + free_rpc_construct_malloc(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_construct_malloc(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("construct_malloc_bdev", spdk_rpc_construct_malloc_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_malloc { + char *name; +}; + +static void +free_rpc_delete_malloc(struct rpc_delete_malloc *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_malloc_decoders[] = { + {"name", offsetof(struct rpc_delete_malloc, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_malloc_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_malloc_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_malloc req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_malloc_decoders, + SPDK_COUNTOF(rpc_delete_malloc_decoders), + &req)) 
{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_MALLOC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_INFOLOG(SPDK_LOG_BDEV_MALLOC, "bdev '%s' does not exist\n", req.name); + rc = -ENODEV; + goto invalid; + } + + delete_malloc_disk(bdev, _spdk_rpc_delete_malloc_bdev_cb, request); + + free_rpc_delete_malloc(&req); + + return; + +invalid: + free_rpc_delete_malloc(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_malloc_bdev", spdk_rpc_delete_malloc_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/null/Makefile b/src/spdk/lib/bdev/null/Makefile new file mode 100644 index 00000000..24962e58 --- /dev/null +++ b/src/spdk/lib/bdev/null/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_null.c bdev_null_rpc.c +LIBNAME = bdev_null + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/null/bdev_null.c b/src/spdk/lib/bdev/null/bdev_null.c new file mode 100644 index 00000000..9ff64725 --- /dev/null +++ b/src/spdk/lib/bdev/null/bdev_null.c @@ -0,0 +1,384 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/json.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_null.h" + +struct null_bdev { + struct spdk_bdev bdev; + TAILQ_ENTRY(null_bdev) tailq; +}; + +struct null_io_channel { + struct spdk_poller *poller; + TAILQ_HEAD(, spdk_bdev_io) io; +}; + +static TAILQ_HEAD(, null_bdev) g_null_bdev_head; +static void *g_null_read_buf; + +static int bdev_null_initialize(void); +static void bdev_null_finish(void); +static void bdev_null_get_spdk_running_config(FILE *fp); + +static struct spdk_bdev_module null_if = { + .name = "null", + .module_init = bdev_null_initialize, + .module_fini = bdev_null_finish, + .config_text = bdev_null_get_spdk_running_config, + .async_fini = true, +}; + +SPDK_BDEV_MODULE_REGISTER(&null_if) + +static int +bdev_null_destruct(void *ctx) +{ + struct null_bdev *bdev = ctx; + + TAILQ_REMOVE(&g_null_bdev_head, bdev, tailq); + free(bdev->bdev.name); + spdk_dma_free(bdev); + + return 0; +} + +static void +bdev_null_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct null_io_channel *ch = spdk_io_channel_get_ctx(_ch); + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { + assert(bdev_io->u.bdev.iovcnt == 1); + bdev_io->u.bdev.iovs[0].iov_base = g_null_read_buf; + bdev_io->u.bdev.iovs[0].iov_len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; + } + TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + TAILQ_INSERT_TAIL(&ch->io, bdev_io, module_link); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } +} + +static bool +bdev_null_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_null_get_io_channel(void *ctx) +{ + return spdk_get_io_channel(&g_null_bdev_head); +} + +static void +bdev_null_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + char 
uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_null_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint64(w, "num_blocks", bdev->blockcnt); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table null_fn_table = { + .destruct = bdev_null_destruct, + .submit_request = bdev_null_submit_request, + .io_type_supported = bdev_null_io_type_supported, + .get_io_channel = bdev_null_get_io_channel, + .write_config_json = bdev_null_write_config_json, +}; + +struct spdk_bdev * +create_null_bdev(const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size) +{ + struct null_bdev *bdev; + int rc; + + if (block_size % 512 != 0) { + SPDK_ERRLOG("Block size %u is not a multiple of 512.\n", block_size); + return NULL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk must be more than 0 blocks\n"); + return NULL; + } + + bdev = spdk_dma_zmalloc(sizeof(*bdev), 0, NULL); + if (!bdev) { + SPDK_ERRLOG("could not allocate null_bdev\n"); + return NULL; + } + + bdev->bdev.name = strdup(name); + if (!bdev->bdev.name) { + spdk_dma_free(bdev); + return NULL; + } + bdev->bdev.product_name = "Null disk"; + + bdev->bdev.write_cache = 0; + bdev->bdev.blocklen = block_size; + bdev->bdev.blockcnt = num_blocks; + if (uuid) { + bdev->bdev.uuid = *uuid; + } else { + spdk_uuid_generate(&bdev->bdev.uuid); + } + + bdev->bdev.ctxt = bdev; + bdev->bdev.fn_table = &null_fn_table; + bdev->bdev.module = &null_if; + + rc = spdk_bdev_register(&bdev->bdev); + if (rc) { + free(bdev->bdev.name); + spdk_dma_free(bdev); + return NULL; + } + + TAILQ_INSERT_TAIL(&g_null_bdev_head, bdev, tailq); + + return &bdev->bdev; +} + +void +delete_null_bdev(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &null_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int +null_io_poll(void *arg) +{ + struct null_io_channel *ch = arg; + TAILQ_HEAD(, spdk_bdev_io) io; + struct spdk_bdev_io *bdev_io; + + TAILQ_INIT(&io); + TAILQ_SWAP(&ch->io, &io, spdk_bdev_io, module_link); + + if (TAILQ_EMPTY(&io)) { + return 0; + } + + while (!TAILQ_EMPTY(&io)) { + bdev_io = TAILQ_FIRST(&io); + TAILQ_REMOVE(&io, bdev_io, module_link); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } + + return 1; +} + +static int +null_bdev_create_cb(void *io_device, void *ctx_buf) +{ + struct null_io_channel *ch = ctx_buf; + + TAILQ_INIT(&ch->io); + ch->poller = spdk_poller_register(null_io_poll, ch, 0); + + return 0; +} + +static void +null_bdev_destroy_cb(void *io_device, void *ctx_buf) +{ + struct null_io_channel *ch = ctx_buf; + + spdk_poller_unregister(&ch->poller); +} + +static int +bdev_null_initialize(void) +{ + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Null"); + uint64_t size_in_mb, num_blocks; + int block_size, i, rc = 0; + struct spdk_bdev *bdev; + const char *name, *val; + + TAILQ_INIT(&g_null_bdev_head); + + /* + * This will be used if upper layer expects us to allocate the read buffer. + * Instead of using a real rbuf from the bdev pool, just always point to + * this same zeroed buffer. 
+ */ + g_null_read_buf = spdk_dma_zmalloc(SPDK_BDEV_LARGE_BUF_MAX_SIZE, 0, NULL); + + /* + * We need to pick some unique address as our "io device" - so just use the + * address of the global tailq. + */ + spdk_io_device_register(&g_null_bdev_head, null_bdev_create_cb, null_bdev_destroy_cb, + sizeof(struct null_io_channel), + "null_bdev"); + + if (sp == NULL) { + goto end; + } + + i = 0; + while (true) { + val = spdk_conf_section_get_nval(sp, "Dev", i); + if (val == NULL) { + break; + } + + name = spdk_conf_section_get_nmval(sp, "Dev", i, 0); + if (name == NULL) { + SPDK_ERRLOG("Null entry %d: Name must be provided\n", i); + continue; + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 1); + if (val == NULL) { + SPDK_ERRLOG("Null entry %d: Size in MB must be provided\n", i); + continue; + } + + errno = 0; + size_in_mb = strtoull(val, NULL, 10); + if (errno) { + SPDK_ERRLOG("Null entry %d: Invalid size in MB %s\n", i, val); + continue; + } + + val = spdk_conf_section_get_nmval(sp, "Dev", i, 2); + if (val == NULL) { + block_size = 512; + } else { + errno = 0; + block_size = (int)strtol(val, NULL, 10); + if (errno) { + SPDK_ERRLOG("Null entry %d: Invalid block size %s\n", i, val); + continue; + } + } + + num_blocks = size_in_mb * (1024 * 1024) / block_size; + + bdev = create_null_bdev(name, NULL, num_blocks, block_size); + if (bdev == NULL) { + SPDK_ERRLOG("Could not create null bdev\n"); + rc = EINVAL; + goto end; + } + + i++; + } + +end: + return rc; +} + +static void +_bdev_null_finish_cb(void *arg) +{ + spdk_dma_free(g_null_read_buf); + spdk_bdev_module_finish_done(); +} + +static void +bdev_null_finish(void) +{ + spdk_io_device_unregister(&g_null_bdev_head, _bdev_null_finish_cb); +} + +static void +bdev_null_get_spdk_running_config(FILE *fp) +{ + struct null_bdev *bdev; + uint64_t null_bdev_size; + + fprintf(fp, "\n[Null]\n"); + + TAILQ_FOREACH(bdev, &g_null_bdev_head, tailq) { + null_bdev_size = bdev->bdev.blocklen * bdev->bdev.blockcnt; + null_bdev_size /= (1024 * 1024); + fprintf(fp, " %s %" PRIu64 " %d\n", + bdev->bdev.name, null_bdev_size, bdev->bdev.blocklen); + } +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_null", SPDK_LOG_BDEV_NULL) diff --git a/src/spdk/lib/bdev/null/bdev_null.h b/src/spdk/lib/bdev/null/bdev_null.h new file mode 100644 index 00000000..fa0123e3 --- /dev/null +++ b/src/spdk/lib/bdev/null/bdev_null.h @@ -0,0 +1,57 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_NULL_H +#define SPDK_BDEV_NULL_H + +#include "spdk/stdinc.h" + +typedef void (*spdk_delete_null_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev; +struct spdk_uuid; + +struct spdk_bdev *create_null_bdev(const char *name, const struct spdk_uuid *uuid, + uint64_t num_blocks, uint32_t block_size); + +/** + * Delete null bdev. + * + * \param bdev Pointer to null bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void delete_null_bdev(struct spdk_bdev *bdev, spdk_delete_null_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_BDEV_NULL_H */ diff --git a/src/spdk/lib/bdev/null/bdev_null_rpc.c b/src/spdk/lib/bdev/null/bdev_null_rpc.c new file mode 100644 index 00000000..9410b7ad --- /dev/null +++ b/src/spdk/lib/bdev/null/bdev_null_rpc.c @@ -0,0 +1,169 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_null.h" + +struct rpc_construct_null { + char *name; + char *uuid; + uint64_t num_blocks; + uint32_t block_size; +}; + +static void +free_rpc_construct_null(struct rpc_construct_null *req) +{ + free(req->name); + free(req->uuid); +} + +static const struct spdk_json_object_decoder rpc_construct_null_decoders[] = { + {"name", offsetof(struct rpc_construct_null, name), spdk_json_decode_string}, + {"uuid", offsetof(struct rpc_construct_null, uuid), spdk_json_decode_string, true}, + {"num_blocks", offsetof(struct rpc_construct_null, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_construct_null, block_size), spdk_json_decode_uint32}, +}; + +static void +spdk_rpc_construct_null_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_null req = {}; + struct spdk_json_write_ctx *w; + struct spdk_uuid *uuid = NULL; + struct spdk_uuid decoded_uuid; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_construct_null_decoders, + SPDK_COUNTOF(rpc_construct_null_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NULL, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.uuid) { + if (spdk_uuid_parse(&decoded_uuid, req.uuid)) { + goto invalid; + } + uuid = &decoded_uuid; + } + + bdev = create_null_bdev(req.name, uuid, req.num_blocks, req.block_size); + if (bdev == NULL) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_construct_null(&req); + return; + } + + spdk_json_write_string(w, bdev->name); + spdk_jsonrpc_end_result(request, w); + free_rpc_construct_null(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_construct_null(&req); +} +SPDK_RPC_REGISTER("construct_null_bdev", spdk_rpc_construct_null_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_null { + char *name; +}; + +static void +free_rpc_delete_null(struct rpc_delete_null *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_null_decoders[] = { + {"name", offsetof(struct rpc_delete_null, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_null_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_null_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_null req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_null_decoders, + SPDK_COUNTOF(rpc_delete_null_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + delete_null_bdev(bdev, _spdk_rpc_delete_null_bdev_cb, request); + + free_rpc_delete_null(&req); + + return; + +invalid: + free_rpc_delete_null(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_null_bdev", spdk_rpc_delete_null_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/nvme/Makefile b/src/spdk/lib/bdev/nvme/Makefile new file mode 
100644 index 00000000..c5a40c74 --- /dev/null +++ b/src/spdk/lib/bdev/nvme/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_nvme.c bdev_nvme_rpc.c nvme_rpc.c +LIBNAME = bdev_nvme + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/nvme/bdev_nvme.c b/src/spdk/lib/bdev/nvme/bdev_nvme.c new file mode 100644 index 00000000..07c3b6ce --- /dev/null +++ b/src/spdk/lib/bdev/nvme/bdev_nvme.c @@ -0,0 +1,1856 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "bdev_nvme.h" + +#include "spdk/config.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/bdev.h" +#include "spdk/json.h" +#include "spdk/nvme.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static void bdev_nvme_get_spdk_running_config(FILE *fp); +static int bdev_nvme_config_json(struct spdk_json_write_ctx *w); + +struct nvme_io_channel { + struct spdk_nvme_qpair *qpair; + struct spdk_poller *poller; + + bool collect_spin_stat; + uint64_t spin_ticks; + uint64_t start_ticks; + uint64_t end_ticks; +}; + +struct nvme_bdev_io { + /** array of iovecs to transfer. */ + struct iovec *iovs; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /** Saved status for admin passthru completion event. */ + struct spdk_nvme_cpl cpl; + + /** Originating thread */ + struct spdk_thread *orig_thread; +}; + +enum data_direction { + BDEV_DISK_READ = 0, + BDEV_DISK_WRITE = 1 +}; + +struct nvme_probe_ctx { + size_t count; + struct spdk_nvme_transport_id trids[NVME_MAX_CONTROLLERS]; + const char *names[NVME_MAX_CONTROLLERS]; + const char *hostnqn; +}; + +static struct spdk_bdev_nvme_opts g_opts = { + .action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE, + .timeout_us = 0, + .retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT, + .nvme_adminq_poll_period_us = 1000000ULL, +}; + +#define NVME_HOTPLUG_POLL_PERIOD_MAX 10000000ULL +#define NVME_HOTPLUG_POLL_PERIOD_DEFAULT 100000ULL + +static int g_hot_insert_nvme_controller_index = 0; +static uint64_t g_nvme_hotplug_poll_period_us = NVME_HOTPLUG_POLL_PERIOD_DEFAULT; +static bool g_nvme_hotplug_enabled = false; +static struct spdk_thread *g_bdev_nvme_init_thread; +static struct spdk_poller *g_hotplug_poller; +static char *g_nvme_hostnqn = NULL; +static pthread_mutex_t g_bdev_nvme_mutex = PTHREAD_MUTEX_INITIALIZER; + +static TAILQ_HEAD(, nvme_ctrlr) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static int nvme_ctrlr_create_bdevs(struct nvme_ctrlr *nvme_ctrlr); +static int bdev_nvme_library_init(void); +static void bdev_nvme_library_fini(void); +static int bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair, + struct nvme_bdev_io *bio, + int direction, struct iovec *iov, int iovcnt, uint64_t lba_count, + uint64_t lba); +static int bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); +static int bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes); +static int bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, 
size_t nbytes, void *md_buf, size_t md_len); +static int nvme_ctrlr_create_bdev(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid); + +struct spdk_nvme_qpair * +spdk_bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch) +{ + struct nvme_io_channel *nvme_ch; + + nvme_ch = spdk_io_channel_get_ctx(ctrlr_io_ch); + + return nvme_ch->qpair; +} + +struct nvme_ctrlr * +spdk_bdev_nvme_lookup_ctrlr(const char *ctrlr_name) +{ + struct nvme_ctrlr *_nvme_ctrlr; + + TAILQ_FOREACH(_nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + if (strcmp(ctrlr_name, _nvme_ctrlr->name) == 0) { + return _nvme_ctrlr; + } + } + + return NULL; +} + +struct nvme_ctrlr * +spdk_bdev_nvme_first_ctrlr(void) +{ + return TAILQ_FIRST(&g_nvme_ctrlrs); +} + +struct nvme_ctrlr * +spdk_bdev_nvme_next_ctrlr(struct nvme_ctrlr *prev) +{ + return TAILQ_NEXT(prev, tailq); +} + +static int +bdev_nvme_get_ctx_size(void) +{ + return sizeof(struct nvme_bdev_io); +} + +static struct spdk_bdev_module nvme_if = { + .name = "nvme", + .module_init = bdev_nvme_library_init, + .module_fini = bdev_nvme_library_fini, + .config_text = bdev_nvme_get_spdk_running_config, + .config_json = bdev_nvme_config_json, + .get_ctx_size = bdev_nvme_get_ctx_size, + +}; +SPDK_BDEV_MODULE_REGISTER(&nvme_if) + +static int +bdev_nvme_readv(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "read %lu blocks with offset %#lx\n", + lba_count, lba); + + return bdev_nvme_queue_cmd(nbdev, nvme_ch->qpair, bio, BDEV_DISK_READ, + iov, iovcnt, lba_count, lba); +} + +static int +bdev_nvme_writev(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct iovec *iov, int iovcnt, uint64_t lba_count, uint64_t lba) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "write %lu blocks with offset %#lx\n", + lba_count, lba); + + return bdev_nvme_queue_cmd(nbdev, nvme_ch->qpair, bio, BDEV_DISK_WRITE, + iov, iovcnt, lba_count, lba); +} + +static int +bdev_nvme_poll(void *arg) +{ + struct nvme_io_channel *ch = arg; + int32_t num_completions; + + if (ch->qpair == NULL) { + return -1; + } + + if (ch->collect_spin_stat && ch->start_ticks == 0) { + ch->start_ticks = spdk_get_ticks(); + } + + num_completions = spdk_nvme_qpair_process_completions(ch->qpair, 0); + + if (ch->collect_spin_stat) { + if (num_completions > 0) { + if (ch->end_ticks != 0) { + ch->spin_ticks += (ch->end_ticks - ch->start_ticks); + ch->end_ticks = 0; + } + ch->start_ticks = 0; + } else { + ch->end_ticks = spdk_get_ticks(); + } + } + + return num_completions; +} + +static int +bdev_nvme_poll_adminq(void *arg) +{ + struct spdk_nvme_ctrlr *ctrlr = arg; + + return spdk_nvme_ctrlr_process_admin_completions(ctrlr); +} + +static void +bdev_nvme_unregister_cb(void *io_device) +{ + struct spdk_nvme_ctrlr *ctrlr = io_device; + + spdk_nvme_detach(ctrlr); +} + +static int +bdev_nvme_destruct(void *ctx) +{ + struct nvme_bdev *nvme_disk = ctx; + struct nvme_ctrlr *nvme_ctrlr = nvme_disk->nvme_ctrlr; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + nvme_ctrlr->ref--; + free(nvme_disk->disk.name); + memset(nvme_disk, 0, sizeof(*nvme_disk)); + if (nvme_ctrlr->ref == 0) { + TAILQ_REMOVE(&g_nvme_ctrlrs, nvme_ctrlr, tailq); + pthread_mutex_unlock(&g_bdev_nvme_mutex); + spdk_io_device_unregister(nvme_ctrlr->ctrlr, bdev_nvme_unregister_cb); + 
spdk_poller_unregister(&nvme_ctrlr->adminq_timer_poller); + free(nvme_ctrlr->name); + free(nvme_ctrlr->bdevs); + free(nvme_ctrlr); + return 0; + } + + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return 0; + +} + +static int +bdev_nvme_flush(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio, + uint64_t offset, uint64_t nbytes) +{ + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_SUCCESS); + + return 0; +} + +static void +_bdev_nvme_reset_done(struct spdk_io_channel_iter *i, int status) +{ + void *ctx = spdk_io_channel_iter_get_ctx(i); + int rc = SPDK_BDEV_IO_STATUS_SUCCESS; + + if (status) { + rc = SPDK_BDEV_IO_STATUS_FAILED; + } + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(ctx), rc); +} + +static void +_bdev_nvme_reset_create_qpair(struct spdk_io_channel_iter *i) +{ + struct spdk_nvme_ctrlr *ctrlr = spdk_io_channel_iter_get_io_device(i); + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(_ch); + + nvme_ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + if (!nvme_ch->qpair) { + spdk_for_each_channel_continue(i, -1); + return; + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +_bdev_nvme_reset(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvme_ctrlr *ctrlr = spdk_io_channel_iter_get_io_device(i); + struct nvme_bdev_io *bio = spdk_io_channel_iter_get_ctx(i); + int rc; + + if (status) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc != 0) { + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(bio), SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + /* Recreate all of the I/O queue pairs */ + spdk_for_each_channel(ctrlr, + _bdev_nvme_reset_create_qpair, + bio, + _bdev_nvme_reset_done); + + +} + +static void +_bdev_nvme_reset_destroy_qpair(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + int rc; + + rc = spdk_nvme_ctrlr_free_io_qpair(nvme_ch->qpair); + if (!rc) { + nvme_ch->qpair = NULL; + } + + spdk_for_each_channel_continue(i, rc); +} + +static int +bdev_nvme_reset(struct nvme_bdev *nbdev, struct nvme_bdev_io *bio) +{ + /* First, delete all NVMe I/O queue pairs. 
*/ + spdk_for_each_channel(nbdev->nvme_ctrlr->ctrlr, + _bdev_nvme_reset_destroy_qpair, + bio, + _bdev_nvme_reset); + + return 0; +} + +static int +bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + uint64_t offset_blocks, + uint64_t num_blocks); + +static void +bdev_nvme_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + int ret; + + ret = bdev_nvme_readv((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + if (spdk_likely(ret == 0)) { + return; + } else if (ret == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static int +_bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + if (nvme_ch->qpair == NULL) { + /* The device is currently resetting */ + return -1; + } + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_nvme_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_nvme_writev((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks, + bdev_io->u.bdev.offset_blocks); + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return bdev_nvme_unmap((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_UNMAP: + return bdev_nvme_unmap((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_nvme_reset((struct nvme_bdev *)bdev_io->bdev->ctxt, + (struct nvme_bdev_io *)bdev_io->driver_ctx); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_nvme_flush((struct nvme_bdev *)bdev_io->bdev->ctxt, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks); + + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + return bdev_nvme_admin_passthru((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes); + + case SPDK_BDEV_IO_TYPE_NVME_IO: + return bdev_nvme_io_passthru((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes); + + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return bdev_nvme_io_passthru_md((struct nvme_bdev *)bdev_io->bdev->ctxt, + ch, + (struct nvme_bdev_io *)bdev_io->driver_ctx, + &bdev_io->u.nvme_passthru.cmd, + bdev_io->u.nvme_passthru.buf, + bdev_io->u.nvme_passthru.nbytes, + bdev_io->u.nvme_passthru.md_buf, + bdev_io->u.nvme_passthru.md_len); + + default: + return -EINVAL; + } + return 0; +} + +static void +bdev_nvme_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + int rc = _bdev_nvme_submit_request(ch, bdev_io); + + if (spdk_unlikely(rc != 0)) { + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } else { + 
spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static bool +bdev_nvme_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct nvme_bdev *nbdev = ctx; + const struct spdk_nvme_ctrlr_data *cdata; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_NVME_ADMIN: + case SPDK_BDEV_IO_TYPE_NVME_IO: + return true; + + case SPDK_BDEV_IO_TYPE_NVME_IO_MD: + return spdk_nvme_ns_get_md_size(nbdev->ns) ? true : false; + + case SPDK_BDEV_IO_TYPE_UNMAP: + cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_ctrlr->ctrlr); + return cdata->oncs.dsm; + + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + cdata = spdk_nvme_ctrlr_get_data(nbdev->nvme_ctrlr->ctrlr); + /* + * If an NVMe controller guarantees reading unallocated blocks returns zero, + * we can implement WRITE_ZEROES as an NVMe deallocate command. + */ + if (cdata->oncs.dsm && + spdk_nvme_ns_get_dealloc_logical_block_read_value(nbdev->ns) == SPDK_NVME_DEALLOC_READ_00) { + return true; + } + /* + * The NVMe controller write_zeroes function is currently not used by our driver. + * If a user submits an arbitrarily large write_zeroes request to the controller, the request will fail. + * Until this is resolved, we only claim support for write_zeroes if deallocated blocks return 0's when read. + */ + return false; + + default: + return false; + } +} + +static int +bdev_nvme_create_cb(void *io_device, void *ctx_buf) +{ + struct spdk_nvme_ctrlr *ctrlr = io_device; + struct nvme_io_channel *ch = ctx_buf; + +#ifdef SPDK_CONFIG_VTUNE + ch->collect_spin_stat = true; +#else + ch->collect_spin_stat = false; +#endif + + ch->qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + + if (ch->qpair == NULL) { + return -1; + } + + ch->poller = spdk_poller_register(bdev_nvme_poll, ch, 0); + return 0; +} + +static void +bdev_nvme_destroy_cb(void *io_device, void *ctx_buf) +{ + struct nvme_io_channel *ch = ctx_buf; + + spdk_nvme_ctrlr_free_io_qpair(ch->qpair); + spdk_poller_unregister(&ch->poller); +} + +static struct spdk_io_channel * +bdev_nvme_get_io_channel(void *ctx) +{ + struct nvme_bdev *nvme_bdev = ctx; + + return spdk_get_io_channel(nvme_bdev->nvme_ctrlr->ctrlr); +} + +void +spdk_bdev_nvme_dump_trid_json(struct spdk_nvme_transport_id *trid, struct spdk_json_write_ctx *w) +{ + const char *trtype_str; + const char *adrfam_str; + + trtype_str = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype_str) { + spdk_json_write_named_string(w, "trtype", trtype_str); + } + + adrfam_str = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam_str) { + spdk_json_write_named_string(w, "adrfam", adrfam_str); + } + + if (trid->traddr[0] != '\0') { + spdk_json_write_named_string(w, "traddr", trid->traddr); + } + + if (trid->trsvcid[0] != '\0') { + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + } + + if (trid->subnqn[0] != '\0') { + spdk_json_write_named_string(w, "subnqn", trid->subnqn); + } +} + +static int +bdev_nvme_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct nvme_bdev *nvme_bdev = ctx; + struct nvme_ctrlr *nvme_ctrlr = nvme_bdev->nvme_ctrlr; + const struct spdk_nvme_ctrlr_data *cdata; + struct spdk_nvme_ns *ns; + union spdk_nvme_vs_register vs; + union spdk_nvme_csts_register csts; + char buf[128]; + + cdata = spdk_nvme_ctrlr_get_data(nvme_bdev->nvme_ctrlr->ctrlr); + vs = spdk_nvme_ctrlr_get_regs_vs(nvme_bdev->nvme_ctrlr->ctrlr); + csts = 
spdk_nvme_ctrlr_get_regs_csts(nvme_bdev->nvme_ctrlr->ctrlr); + ns = nvme_bdev->ns; + + spdk_json_write_named_object_begin(w, "nvme"); + + if (nvme_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_json_write_named_string(w, "pci_address", nvme_ctrlr->trid.traddr); + } + + spdk_json_write_named_object_begin(w, "trid"); + + spdk_bdev_nvme_dump_trid_json(&nvme_ctrlr->trid, w); + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "ctrlr_data"); + + spdk_json_write_named_string_fmt(w, "vendor_id", "0x%04x", cdata->vid); + + snprintf(buf, sizeof(cdata->mn) + 1, "%s", cdata->mn); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "model_number", buf); + + snprintf(buf, sizeof(cdata->sn) + 1, "%s", cdata->sn); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "serial_number", buf); + + snprintf(buf, sizeof(cdata->fr) + 1, "%s", cdata->fr); + spdk_str_trim(buf); + spdk_json_write_named_string(w, "firmware_revision", buf); + + spdk_json_write_named_object_begin(w, "oacs"); + + spdk_json_write_named_uint32(w, "security", cdata->oacs.security); + spdk_json_write_named_uint32(w, "format", cdata->oacs.format); + spdk_json_write_named_uint32(w, "firmware", cdata->oacs.firmware); + spdk_json_write_named_uint32(w, "ns_manage", cdata->oacs.ns_manage); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "vs"); + + spdk_json_write_name(w, "nvme_version"); + if (vs.bits.ter) { + spdk_json_write_string_fmt(w, "%u.%u.%u", vs.bits.mjr, vs.bits.mnr, vs.bits.ter); + } else { + spdk_json_write_string_fmt(w, "%u.%u", vs.bits.mjr, vs.bits.mnr); + } + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "csts"); + + spdk_json_write_named_uint32(w, "rdy", csts.bits.rdy); + spdk_json_write_named_uint32(w, "cfs", csts.bits.cfs); + + spdk_json_write_object_end(w); + + spdk_json_write_named_object_begin(w, "ns_data"); + + spdk_json_write_named_uint32(w, "id", spdk_nvme_ns_get_id(ns)); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_nvme_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +static uint64_t +bdev_nvme_get_spin_time(struct spdk_io_channel *ch) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + uint64_t spin_time; + + if (!nvme_ch->collect_spin_stat) { + return 0; + } + + if (nvme_ch->end_ticks != 0) { + nvme_ch->spin_ticks += (nvme_ch->end_ticks - nvme_ch->start_ticks); + nvme_ch->end_ticks = 0; + } + + spin_time = (nvme_ch->spin_ticks * 1000000ULL) / spdk_get_ticks_hz(); + nvme_ch->start_ticks = 0; + nvme_ch->spin_ticks = 0; + + return spin_time; +} + +static const struct spdk_bdev_fn_table nvmelib_fn_table = { + .destruct = bdev_nvme_destruct, + .submit_request = bdev_nvme_submit_request, + .io_type_supported = bdev_nvme_io_type_supported, + .get_io_channel = bdev_nvme_get_io_channel, + .dump_info_json = bdev_nvme_dump_info_json, + .write_config_json = bdev_nvme_write_config_json, + .get_spin_time = bdev_nvme_get_spin_time, +}; + +static int +nvme_ctrlr_create_bdev(struct nvme_ctrlr *nvme_ctrlr, uint32_t nsid) +{ + struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; + struct nvme_bdev *bdev; + struct spdk_nvme_ns *ns; + const struct spdk_uuid *uuid; + const struct spdk_nvme_ctrlr_data *cdata; + int rc; + + cdata = spdk_nvme_ctrlr_get_data(ctrlr); + + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (!ns) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Invalid 
NS %d\n", nsid); + return -EINVAL; + } + + bdev = &nvme_ctrlr->bdevs[nsid - 1]; + bdev->id = nsid; + + bdev->nvme_ctrlr = nvme_ctrlr; + bdev->ns = ns; + nvme_ctrlr->ref++; + + bdev->disk.name = spdk_sprintf_alloc("%sn%d", nvme_ctrlr->name, spdk_nvme_ns_get_id(ns)); + if (!bdev->disk.name) { + nvme_ctrlr->ref--; + memset(bdev, 0, sizeof(*bdev)); + return -ENOMEM; + } + bdev->disk.product_name = "NVMe disk"; + + bdev->disk.write_cache = 0; + if (cdata->vwc.present) { + /* Enable if the Volatile Write Cache exists */ + bdev->disk.write_cache = 1; + } + bdev->disk.blocklen = spdk_nvme_ns_get_extended_sector_size(ns); + bdev->disk.blockcnt = spdk_nvme_ns_get_num_sectors(ns); + bdev->disk.optimal_io_boundary = spdk_nvme_ns_get_optimal_io_boundary(ns); + + uuid = spdk_nvme_ns_get_uuid(ns); + if (uuid != NULL) { + bdev->disk.uuid = *uuid; + } + + bdev->disk.ctxt = bdev; + bdev->disk.fn_table = &nvmelib_fn_table; + bdev->disk.module = &nvme_if; + rc = spdk_bdev_register(&bdev->disk); + if (rc) { + free(bdev->disk.name); + nvme_ctrlr->ref--; + memset(bdev, 0, sizeof(*bdev)); + return rc; + } + bdev->active = true; + + return 0; +} + + +static bool +hotplug_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attaching to %s\n", trid->traddr); + + return true; +} + +static struct nvme_ctrlr * +nvme_ctrlr_get(const struct spdk_nvme_transport_id *trid) +{ + struct nvme_ctrlr *nvme_ctrlr; + + TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(trid, &nvme_ctrlr->trid) == 0) { + return nvme_ctrlr; + } + } + + return NULL; +} + +static struct nvme_ctrlr * +nvme_ctrlr_get_by_name(const char *name) +{ + struct nvme_ctrlr *nvme_ctrlr; + + if (name == NULL) { + return NULL; + } + + TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + if (strcmp(name, nvme_ctrlr->name) == 0) { + return nvme_ctrlr; + } + } + + return NULL; +} + +static bool +probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct nvme_probe_ctx *ctx = cb_ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Probing device %s\n", trid->traddr); + + if (nvme_ctrlr_get(trid)) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", + trid->traddr); + return false; + } + + if (trid->trtype == SPDK_NVME_TRANSPORT_PCIE) { + bool claim_device = false; + size_t i; + + for (i = 0; i < ctx->count; i++) { + if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { + claim_device = true; + break; + } + } + + if (!claim_device) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Not claiming device at %s\n", trid->traddr); + return false; + } + } + + if (ctx->hostnqn) { + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "%s", ctx->hostnqn); + } + + return true; +} + +static void +spdk_nvme_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Abort failed. Resetting controller.\n"); + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + SPDK_ERRLOG("Resetting controller failed.\n"); + } + } +} + +static void +timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, uint16_t cid) +{ + int rc; + union spdk_nvme_csts_register csts; + + SPDK_WARNLOG("Warning: Detected a timeout. 
ctrlr=%p qpair=%p cid=%u\n", ctrlr, qpair, cid); + + csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); + if (csts.bits.cfs) { + SPDK_ERRLOG("Controller Fatal Status, reset required\n"); + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + SPDK_ERRLOG("Resetting controller failed.\n"); + } + return; + } + + switch (g_opts.action_on_timeout) { + case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: + if (qpair) { + rc = spdk_nvme_ctrlr_cmd_abort(ctrlr, qpair, cid, + spdk_nvme_abort_cpl, ctrlr); + if (rc == 0) { + return; + } + + SPDK_ERRLOG("Unable to send abort. Resetting.\n"); + } + + /* FALLTHROUGH */ + case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + SPDK_ERRLOG("Resetting controller failed.\n"); + } + break; + case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: + break; + } +} + +static void +nvme_ctrlr_deactivate_bdev(struct nvme_bdev *bdev) +{ + spdk_bdev_unregister(&bdev->disk, NULL, NULL); + bdev->active = false; +} + +static void +nvme_ctrlr_update_ns_bdevs(struct nvme_ctrlr *nvme_ctrlr) +{ + struct spdk_nvme_ctrlr *ctrlr = nvme_ctrlr->ctrlr; + uint32_t i; + struct nvme_bdev *bdev; + + for (i = 0; i < nvme_ctrlr->num_ns; i++) { + uint32_t nsid = i + 1; + + bdev = &nvme_ctrlr->bdevs[i]; + if (!bdev->active && spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) { + SPDK_NOTICELOG("NSID %u to be added\n", nsid); + nvme_ctrlr_create_bdev(nvme_ctrlr, nsid); + } + + if (bdev->active && !spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) { + SPDK_NOTICELOG("NSID %u Bdev %s is removed\n", nsid, bdev->disk.name); + nvme_ctrlr_deactivate_bdev(bdev); + } + } + +} + +static void +aer_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_ctrlr *nvme_ctrlr = arg; + union spdk_nvme_async_event_completion event; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("AER request execute failed"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + nvme_ctrlr_update_ns_bdevs(nvme_ctrlr); + } +} + +static int +create_ctrlr(struct spdk_nvme_ctrlr *ctrlr, + const char *name, + const struct spdk_nvme_transport_id *trid) +{ + struct nvme_ctrlr *nvme_ctrlr; + + nvme_ctrlr = calloc(1, sizeof(*nvme_ctrlr)); + if (nvme_ctrlr == NULL) { + SPDK_ERRLOG("Failed to allocate device struct\n"); + return -ENOMEM; + } + nvme_ctrlr->num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + nvme_ctrlr->bdevs = calloc(nvme_ctrlr->num_ns, sizeof(struct nvme_bdev)); + if (!nvme_ctrlr->bdevs) { + SPDK_ERRLOG("Failed to allocate block devices struct\n"); + free(nvme_ctrlr); + return -ENOMEM; + } + + nvme_ctrlr->adminq_timer_poller = NULL; + nvme_ctrlr->ctrlr = ctrlr; + nvme_ctrlr->ref = 0; + nvme_ctrlr->trid = *trid; + nvme_ctrlr->name = strdup(name); + if (nvme_ctrlr->name == NULL) { + free(nvme_ctrlr->bdevs); + free(nvme_ctrlr); + return -ENOMEM; + } + + spdk_io_device_register(ctrlr, bdev_nvme_create_cb, bdev_nvme_destroy_cb, + sizeof(struct nvme_io_channel), + name); + + if (nvme_ctrlr_create_bdevs(nvme_ctrlr) != 0) { + spdk_io_device_unregister(ctrlr, bdev_nvme_unregister_cb); + free(nvme_ctrlr->bdevs); + free(nvme_ctrlr->name); + free(nvme_ctrlr); + return -1; + } + + nvme_ctrlr->adminq_timer_poller = spdk_poller_register(bdev_nvme_poll_adminq, ctrlr, + g_opts.nvme_adminq_poll_period_us); + + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, nvme_ctrlr, tailq); + + if (g_opts.timeout_us > 0 && g_opts.action_on_timeout != SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE) { + 
spdk_nvme_ctrlr_register_timeout_callback(ctrlr, g_opts.timeout_us, + timeout_cb, NULL); + } + + spdk_nvme_ctrlr_register_aer_callback(ctrlr, aer_cb, nvme_ctrlr); + + return 0; +} + +static void +attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts) +{ + struct nvme_probe_ctx *ctx = cb_ctx; + char *name = NULL; + size_t i; + + if (ctx) { + for (i = 0; i < ctx->count; i++) { + if (spdk_nvme_transport_id_compare(trid, &ctx->trids[i]) == 0) { + name = strdup(ctx->names[i]); + break; + } + } + } else { + name = spdk_sprintf_alloc("HotInNvme%d", g_hot_insert_nvme_controller_index++); + } + if (!name) { + SPDK_ERRLOG("Failed to assign name to NVMe device\n"); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_NVME, "Attached to %s (%s)\n", trid->traddr, name); + + create_ctrlr(ctrlr, name, trid); + + free(name); +} + +static void +remove_cb(void *cb_ctx, struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i; + struct nvme_ctrlr *nvme_ctrlr; + struct nvme_bdev *nvme_bdev; + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + if (nvme_ctrlr->ctrlr == ctrlr) { + pthread_mutex_unlock(&g_bdev_nvme_mutex); + for (i = 0; i < nvme_ctrlr->num_ns; i++) { + uint32_t nsid = i + 1; + + nvme_bdev = &nvme_ctrlr->bdevs[nsid - 1]; + assert(nvme_bdev->id == nsid); + if (nvme_bdev->active) { + spdk_bdev_unregister(&nvme_bdev->disk, NULL, NULL); + } + } + return; + } + } + pthread_mutex_unlock(&g_bdev_nvme_mutex); +} + +static int +bdev_nvme_hotplug(void *arg) +{ + if (spdk_nvme_probe(NULL, NULL, hotplug_probe_cb, attach_cb, remove_cb) != 0) { + SPDK_ERRLOG("spdk_nvme_probe() failed\n"); + } + + return -1; +} + +void +spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts) +{ + *opts = g_opts; +} + +int +spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts) +{ + if (g_bdev_nvme_init_thread != NULL) { + return -EPERM; + } + + g_opts = *opts; + + return 0; +} +struct set_nvme_hotplug_ctx { + uint64_t period_us; + bool enabled; + spdk_thread_fn fn; + void *fn_ctx; +}; + +static void +set_nvme_hotplug_period_cb(void *_ctx) +{ + struct set_nvme_hotplug_ctx *ctx = _ctx; + + spdk_poller_unregister(&g_hotplug_poller); + if (ctx->enabled) { + g_hotplug_poller = spdk_poller_register(bdev_nvme_hotplug, NULL, ctx->period_us); + } + + g_nvme_hotplug_poll_period_us = ctx->period_us; + g_nvme_hotplug_enabled = ctx->enabled; + if (ctx->fn) { + ctx->fn(ctx->fn_ctx); + } + + free(ctx); +} + +int +spdk_bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_thread_fn cb, void *cb_ctx) +{ + struct set_nvme_hotplug_ctx *ctx; + + if (enabled == true && !spdk_process_is_primary()) { + return -EPERM; + } + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + return -ENOMEM; + } + + period_us = period_us == 0 ? 
NVME_HOTPLUG_POLL_PERIOD_DEFAULT : period_us; + ctx->period_us = spdk_min(period_us, NVME_HOTPLUG_POLL_PERIOD_MAX); + ctx->enabled = enabled; + ctx->fn = cb; + ctx->fn_ctx = cb_ctx; + + spdk_thread_send_msg(g_bdev_nvme_init_thread, set_nvme_hotplug_period_cb, ctx); + return 0; +} + +int +spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid, + const char *base_name, + const char **names, size_t *count, + const char *hostnqn) +{ + struct nvme_probe_ctx *probe_ctx; + struct nvme_ctrlr *nvme_ctrlr; + struct nvme_bdev *nvme_bdev; + uint32_t i, nsid; + size_t j; + + if (nvme_ctrlr_get(trid) != NULL) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", trid->traddr); + return -1; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (probe_ctx == NULL) { + SPDK_ERRLOG("Failed to allocate probe_ctx\n"); + return -1; + } + + probe_ctx->count = 1; + probe_ctx->trids[0] = *trid; + probe_ctx->names[0] = base_name; + probe_ctx->hostnqn = hostnqn; + if (spdk_nvme_probe(trid, probe_ctx, probe_cb, attach_cb, NULL)) { + SPDK_ERRLOG("Failed to probe for new devices\n"); + free(probe_ctx); + return -1; + } + + nvme_ctrlr = nvme_ctrlr_get(trid); + if (!nvme_ctrlr) { + SPDK_ERRLOG("Failed to find new NVMe controller\n"); + free(probe_ctx); + return -1; + } + + /* + * Report the new bdevs that were created in this call. + * There can be more than one bdev per NVMe controller since one bdev is created per namespace. + */ + j = 0; + for (i = 0; i < nvme_ctrlr->num_ns; i++) { + nsid = i + 1; + nvme_bdev = &nvme_ctrlr->bdevs[nsid - 1]; + if (!nvme_bdev->active) { + continue; + } + assert(nvme_bdev->id == nsid); + if (j < *count) { + names[j] = nvme_bdev->disk.name; + j++; + } else { + SPDK_ERRLOG("Maximum number of namespaces supported per NVMe controller is %zu. 
Unable to return all names of created bdevs\n", + *count); + free(probe_ctx); + return -1; + } + } + + *count = j; + + free(probe_ctx); + return 0; +} + +int +spdk_bdev_nvme_delete(const char *name) +{ + struct nvme_ctrlr *nvme_ctrlr = NULL; + + if (name == NULL) { + return -EINVAL; + } + + nvme_ctrlr = nvme_ctrlr_get_by_name(name); + if (nvme_ctrlr == NULL) { + SPDK_ERRLOG("Failed to find NVMe controller\n"); + return -ENODEV; + } + + remove_cb(NULL, nvme_ctrlr->ctrlr); + return 0; +} + +static int +bdev_nvme_library_init(void) +{ + struct spdk_conf_section *sp; + const char *val; + int rc = 0; + int64_t intval = 0; + size_t i; + struct nvme_probe_ctx *probe_ctx = NULL; + int retry_count; + uint32_t local_nvme_num = 0; + int64_t hotplug_period; + bool hotplug_enabled = g_nvme_hotplug_enabled; + + g_bdev_nvme_init_thread = spdk_get_thread(); + + sp = spdk_conf_find_section(NULL, "Nvme"); + if (sp == NULL) { + goto end; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (probe_ctx == NULL) { + SPDK_ERRLOG("Failed to allocate probe_ctx\n"); + rc = -1; + goto end; + } + + if ((retry_count = spdk_conf_section_get_intval(sp, "RetryCount")) < 0) { + if ((retry_count = spdk_conf_section_get_intval(sp, "NvmeRetryCount")) < 0) { + retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT; + } else { + SPDK_WARNLOG("NvmeRetryCount was renamed to RetryCount\n"); + SPDK_WARNLOG("Please update your configuration file\n"); + } + } + + g_opts.retry_count = retry_count; + + val = spdk_conf_section_get_val(sp, "TimeoutUsec"); + if (val != NULL) { + intval = strtoll(val, NULL, 10); + if (intval == LLONG_MIN || intval == LLONG_MAX) { + SPDK_ERRLOG("Invalid TimeoutUsec value\n"); + rc = -1; + goto end; + } else if (intval < 0) { + intval = 0; + } + } + + g_opts.timeout_us = intval; + + if (g_opts.timeout_us > 0) { + val = spdk_conf_section_get_val(sp, "ActionOnTimeout"); + if (val != NULL) { + if (!strcasecmp(val, "Reset")) { + g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; + } else if (!strcasecmp(val, "Abort")) { + g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; + } + } else { + /* Handle old name for backward compatibility */ + val = spdk_conf_section_get_val(sp, "ResetControllerOnTimeout"); + if (val) { + SPDK_WARNLOG("ResetControllerOnTimeout was renamed to ActionOnTimeout\n"); + SPDK_WARNLOG("Please update your configuration file\n"); + + if (spdk_conf_section_get_boolval(sp, "ResetControllerOnTimeout", false)) { + g_opts.action_on_timeout = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; + } + } + } + } + + intval = spdk_conf_section_get_intval(sp, "AdminPollRate"); + if (intval > 0) { + g_opts.nvme_adminq_poll_period_us = intval; + } + + if (spdk_process_is_primary()) { + hotplug_enabled = spdk_conf_section_get_boolval(sp, "HotplugEnable", false); + } + + hotplug_period = spdk_conf_section_get_intval(sp, "HotplugPollRate"); + + g_nvme_hostnqn = spdk_conf_section_get_val(sp, "HostNQN"); + probe_ctx->hostnqn = g_nvme_hostnqn; + + for (i = 0; i < NVME_MAX_CONTROLLERS; i++) { + val = spdk_conf_section_get_nmval(sp, "TransportID", i, 0); + if (val == NULL) { + break; + } + + rc = spdk_nvme_transport_id_parse(&probe_ctx->trids[i], val); + if (rc < 0) { + SPDK_ERRLOG("Unable to parse TransportID: %s\n", val); + rc = -1; + goto end; + } + + val = spdk_conf_section_get_nmval(sp, "TransportID", i, 1); + if (val == NULL) { + SPDK_ERRLOG("No name provided for TransportID\n"); + rc = -1; + goto end; + } + + probe_ctx->names[i] = val; + probe_ctx->count++; + + if (probe_ctx->trids[i].trtype != 
SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + if (nvme_ctrlr_get(&probe_ctx->trids[i])) { + SPDK_ERRLOG("A controller with the provided trid (traddr: %s) already exists.\n", + probe_ctx->trids[i].traddr); + rc = -1; + goto end; + } + + if (probe_ctx->trids[i].subnqn[0] == '\0') { + SPDK_ERRLOG("Need to provide subsystem nqn\n"); + rc = -1; + goto end; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + + if (probe_ctx->hostnqn != NULL) { + snprintf(opts.hostnqn, sizeof(opts.hostnqn), "%s", probe_ctx->hostnqn); + } + + ctrlr = spdk_nvme_connect(&probe_ctx->trids[i], &opts, sizeof(opts)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unable to connect to provided trid (traddr: %s)\n", + probe_ctx->trids[i].traddr); + rc = -1; + goto end; + } + + rc = create_ctrlr(ctrlr, probe_ctx->names[i], &probe_ctx->trids[i]); + if (rc) { + goto end; + } + } else { + local_nvme_num++; + } + } + + if (local_nvme_num > 0) { + /* used to probe local NVMe device */ + if (spdk_nvme_probe(NULL, probe_ctx, probe_cb, attach_cb, NULL)) { + rc = -1; + goto end; + } + + for (i = 0; i < probe_ctx->count; i++) { + if (probe_ctx->trids[i].trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; + } + + if (!nvme_ctrlr_get(&probe_ctx->trids[i])) { + SPDK_ERRLOG("NVMe SSD \"%s\" could not be found.\n", probe_ctx->trids[i].traddr); + SPDK_ERRLOG("Check PCIe BDF and that it is attached to UIO/VFIO driver.\n"); + } + } + } + + rc = spdk_bdev_nvme_set_hotplug(hotplug_enabled, hotplug_period, NULL, NULL); + if (rc) { + SPDK_ERRLOG("Failed to setup hotplug (%d): %s", rc, spdk_strerror(rc)); + rc = -1; + } +end: + spdk_nvme_retry_count = g_opts.retry_count; + + free(probe_ctx); + return rc; +} + +static void +bdev_nvme_library_fini(void) +{ + spdk_poller_unregister(&g_hotplug_poller); +} + +static int +nvme_ctrlr_create_bdevs(struct nvme_ctrlr *nvme_ctrlr) +{ + int rc; + int bdev_created = 0; + uint32_t nsid; + + for (nsid = spdk_nvme_ctrlr_get_first_active_ns(nvme_ctrlr->ctrlr); + nsid != 0; nsid = spdk_nvme_ctrlr_get_next_active_ns(nvme_ctrlr->ctrlr, nsid)) { + rc = nvme_ctrlr_create_bdev(nvme_ctrlr, nsid); + if (rc == 0) { + bdev_created++; + } + } + + return (bdev_created > 0) ? 
0 : -1; +} + +static void +bdev_nvme_queued_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx((struct nvme_bdev_io *)ref); + + spdk_bdev_io_complete_nvme_status(bdev_io, cpl->status.sct, cpl->status.sc); +} + +static void +bdev_nvme_admin_passthru_completion(void *ctx) +{ + struct nvme_bdev_io *bio = ctx; + struct spdk_bdev_io *bdev_io = spdk_bdev_io_from_ctx(bio); + + spdk_bdev_io_complete_nvme_status(bdev_io, + bio->cpl.status.sct, bio->cpl.status.sc); +} + +static void +bdev_nvme_admin_passthru_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_bdev_io *bio = ref; + + bio->cpl = *cpl; + spdk_thread_send_msg(bio->orig_thread, bdev_nvme_admin_passthru_completion, bio); +} + +static void +bdev_nvme_queued_reset_sgl(void *ref, uint32_t sgl_offset) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + bio->iov_offset = sgl_offset; + for (bio->iovpos = 0; bio->iovpos < bio->iovcnt; bio->iovpos++) { + iov = &bio->iovs[bio->iovpos]; + if (bio->iov_offset < iov->iov_len) { + break; + } + + bio->iov_offset -= iov->iov_len; + } +} + +static int +bdev_nvme_queued_next_sge(void *ref, void **address, uint32_t *length) +{ + struct nvme_bdev_io *bio = ref; + struct iovec *iov; + + assert(bio->iovpos < bio->iovcnt); + + iov = &bio->iovs[bio->iovpos]; + + *address = iov->iov_base; + *length = iov->iov_len; + + if (bio->iov_offset) { + assert(bio->iov_offset <= iov->iov_len); + *address += bio->iov_offset; + *length -= bio->iov_offset; + } + + bio->iov_offset += *length; + if (bio->iov_offset == iov->iov_len) { + bio->iovpos++; + bio->iov_offset = 0; + } + + return 0; +} + +static int +bdev_nvme_queue_cmd(struct nvme_bdev *bdev, struct spdk_nvme_qpair *qpair, + struct nvme_bdev_io *bio, + int direction, struct iovec *iov, int iovcnt, uint64_t lba_count, + uint64_t lba) +{ + int rc; + + bio->iovs = iov; + bio->iovcnt = iovcnt; + bio->iovpos = 0; + bio->iov_offset = 0; + + if (direction == BDEV_DISK_READ) { + rc = spdk_nvme_ns_cmd_readv(bdev->ns, qpair, lba, + lba_count, bdev_nvme_queued_done, bio, 0, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); + } else { + rc = spdk_nvme_ns_cmd_writev(bdev->ns, qpair, lba, + lba_count, bdev_nvme_queued_done, bio, 0, + bdev_nvme_queued_reset_sgl, bdev_nvme_queued_next_sge); + } + + if (rc != 0 && rc != -ENOMEM) { + SPDK_ERRLOG("%s failed: rc = %d\n", direction == BDEV_DISK_READ ? 
"readv" : "writev", rc); + } + return rc; +} + +static int +bdev_nvme_unmap(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + uint64_t offset_blocks, + uint64_t num_blocks) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + struct spdk_nvme_dsm_range dsm_ranges[SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES]; + struct spdk_nvme_dsm_range *range; + uint64_t offset, remaining; + uint64_t num_ranges_u64; + uint16_t num_ranges; + int rc; + + num_ranges_u64 = (num_blocks + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS - 1) / + SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + if (num_ranges_u64 > SPDK_COUNTOF(dsm_ranges)) { + SPDK_ERRLOG("Unmap request for %" PRIu64 " blocks is too large\n", num_blocks); + return -EINVAL; + } + num_ranges = (uint16_t)num_ranges_u64; + + offset = offset_blocks; + remaining = num_blocks; + range = &dsm_ranges[0]; + + /* Fill max-size ranges until the remaining blocks fit into one range */ + while (remaining > SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS) { + range->attributes.raw = 0; + range->length = SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + range->starting_lba = offset; + + offset += SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + remaining -= SPDK_NVME_DATASET_MANAGEMENT_RANGE_MAX_BLOCKS; + range++; + } + + /* Final range describes the remaining blocks */ + range->attributes.raw = 0; + range->length = remaining; + range->starting_lba = offset; + + rc = spdk_nvme_ns_cmd_dataset_management(nbdev->ns, nvme_ch->qpair, + SPDK_NVME_DSM_ATTR_DEALLOCATE, + dsm_ranges, num_ranges, + bdev_nvme_queued_done, bio); + + return rc; +} + +static int +bdev_nvme_admin_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) +{ + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + bio->orig_thread = spdk_io_channel_get_thread(ch); + + return spdk_nvme_ctrlr_cmd_admin_raw(nbdev->nvme_ctrlr->ctrlr, cmd, buf, + (uint32_t)nbytes, bdev_nvme_admin_passthru_done, bio); +} + +static int +bdev_nvme_io_passthru(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + /* + * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, + * so fill it out automatically. 
+ */ + cmd->nsid = spdk_nvme_ns_get_id(nbdev->ns); + + return spdk_nvme_ctrlr_cmd_io_raw(nbdev->nvme_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, + (uint32_t)nbytes, bdev_nvme_queued_done, bio); +} + +static int +bdev_nvme_io_passthru_md(struct nvme_bdev *nbdev, struct spdk_io_channel *ch, + struct nvme_bdev_io *bio, + struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len) +{ + struct nvme_io_channel *nvme_ch = spdk_io_channel_get_ctx(ch); + size_t nr_sectors = nbytes / spdk_nvme_ns_get_extended_sector_size(nbdev->ns); + uint32_t max_xfer_size = spdk_nvme_ctrlr_get_max_xfer_size(nbdev->nvme_ctrlr->ctrlr); + + if (nbytes > max_xfer_size) { + SPDK_ERRLOG("nbytes is greater than MDTS %" PRIu32 ".\n", max_xfer_size); + return -EINVAL; + } + + if (md_len != nr_sectors * spdk_nvme_ns_get_md_size(nbdev->ns)) { + SPDK_ERRLOG("invalid meta data buffer size\n"); + return -EINVAL; + } + + /* + * Each NVMe bdev is a specific namespace, and all NVMe I/O commands require a nsid, + * so fill it out automatically. + */ + cmd->nsid = spdk_nvme_ns_get_id(nbdev->ns); + + return spdk_nvme_ctrlr_cmd_io_raw_with_md(nbdev->nvme_ctrlr->ctrlr, nvme_ch->qpair, cmd, buf, + (uint32_t)nbytes, md_buf, bdev_nvme_queued_done, bio); +} + +static void +bdev_nvme_get_spdk_running_config(FILE *fp) +{ + struct nvme_ctrlr *nvme_ctrlr; + + fprintf(fp, "\n[Nvme]"); + fprintf(fp, "\n" + "# NVMe Device Whitelist\n" + "# Users may specify which NVMe devices to claim by their transport id.\n" + "# See spdk_nvme_transport_id_parse() in spdk/nvme.h for the correct format.\n" + "# The second argument is the assigned name, which can be referenced from\n" + "# other sections in the configuration file. For NVMe devices, a namespace\n" + "# is automatically appended to each name in the format nY, where\n" + "# Y is the NSID (starts at 1).\n"); + + TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + const char *trtype; + + trtype = spdk_nvme_transport_id_trtype_str(nvme_ctrlr->trid.trtype); + if (!trtype) { + continue; + } + + if (nvme_ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + fprintf(fp, "TransportID \"trtype:%s traddr:%s\" %s\n", + trtype, + nvme_ctrlr->trid.traddr, nvme_ctrlr->name); + } else { + const char *adrfam; + + adrfam = spdk_nvme_transport_id_adrfam_str(nvme_ctrlr->trid.adrfam); + + if (adrfam) { + fprintf(fp, "TransportID \"trtype:%s adrfam:%s traddr:%s trsvcid:%s subnqn:%s\" %s\n", + trtype, adrfam, + nvme_ctrlr->trid.traddr, nvme_ctrlr->trid.trsvcid, + nvme_ctrlr->trid.subnqn, nvme_ctrlr->name); + } else { + fprintf(fp, "TransportID \"trtype:%s traddr:%s trsvcid:%s subnqn:%s\" %s\n", + trtype, + nvme_ctrlr->trid.traddr, nvme_ctrlr->trid.trsvcid, + nvme_ctrlr->trid.subnqn, nvme_ctrlr->name); + } + + } + } + + fprintf(fp, "\n" + "# The number of attempts per I/O when an I/O fails. Do not include\n" + "# this key to get the default behavior.\n"); + fprintf(fp, "RetryCount %d\n", spdk_nvme_retry_count); + fprintf(fp, "\n" + "# Timeout for each command, in microseconds. If 0, don't track timeouts.\n"); + fprintf(fp, "TimeoutUsec %"PRIu64"\n", g_opts.timeout_us); + + fprintf(fp, "\n" + "# Action to take on command time out. Only valid when Timeout is greater\n" + "# than 0. 
This may be 'Reset' to reset the controller, 'Abort' to abort\n" + "# the command, or 'None' to just print a message but do nothing.\n" + "# Admin command timeouts will always result in a reset.\n"); + switch (g_opts.action_on_timeout) { + case SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE: + fprintf(fp, "ActionOnTimeout None\n"); + break; + case SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET: + fprintf(fp, "ActionOnTimeout Reset\n"); + break; + case SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT: + fprintf(fp, "ActionOnTimeout Abort\n"); + break; + } + + fprintf(fp, "\n" + "# Set how often the admin queue is polled for asynchronous events.\n" + "# Units in microseconds.\n"); + fprintf(fp, "AdminPollRate %"PRIu64"\n", g_opts.nvme_adminq_poll_period_us); + fprintf(fp, "\n" + "# Disable handling of hotplug (runtime insert and remove) events,\n" + "# users can set to Yes if want to enable it.\n" + "# Default: No\n"); + fprintf(fp, "HotplugEnable %s\n", g_nvme_hotplug_enabled ? "Yes" : "No"); + fprintf(fp, "\n" + "# Set how often the hotplug is processed for insert and remove events." + "# Units in microseconds.\n"); + fprintf(fp, "HotplugPollRate %"PRIu64"\n", g_nvme_hotplug_poll_period_us); + if (g_nvme_hostnqn) { + fprintf(fp, "HostNQN %s\n", g_nvme_hostnqn); + } + + fprintf(fp, "\n"); +} + +static int +bdev_nvme_config_json(struct spdk_json_write_ctx *w) +{ + struct nvme_ctrlr *nvme_ctrlr; + struct spdk_nvme_transport_id *trid; + const char *action; + + if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET) { + action = "reset"; + } else if (g_opts.action_on_timeout == SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT) { + action = "abort"; + } else { + action = "none"; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "set_bdev_nvme_options"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "action_on_timeout", action); + spdk_json_write_named_uint64(w, "timeout_us", g_opts.timeout_us); + spdk_json_write_named_uint32(w, "retry_count", g_opts.retry_count); + spdk_json_write_named_uint64(w, "nvme_adminq_poll_period_us", g_opts.nvme_adminq_poll_period_us); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + pthread_mutex_lock(&g_bdev_nvme_mutex); + TAILQ_FOREACH(nvme_ctrlr, &g_nvme_ctrlrs, tailq) { + trid = &nvme_ctrlr->trid; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_nvme_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", nvme_ctrlr->name); + spdk_bdev_nvme_dump_trid_json(trid, w); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + /* Dump as last parameter to give all NVMe bdevs chance to be constructed + * before enabling hotplug poller. 
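+ * The object written below has the following shape (values shown are illustrative; + * they mirror g_nvme_hotplug_poll_period_us and g_nvme_hotplug_enabled): + *   { "method": "set_bdev_nvme_hotplug", + *     "params": { "period_us": 100000, "enable": false } }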
+ */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "set_bdev_nvme_hotplug"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint64(w, "period_us", g_nvme_hotplug_poll_period_us); + spdk_json_write_named_bool(w, "enable", g_nvme_hotplug_enabled); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + pthread_mutex_unlock(&g_bdev_nvme_mutex); + return 0; +} + +struct spdk_nvme_ctrlr * +spdk_bdev_nvme_get_ctrlr(struct spdk_bdev *bdev) +{ + if (!bdev || bdev->module != &nvme_if) { + return NULL; + } + + return SPDK_CONTAINEROF(bdev, struct nvme_bdev, disk)->nvme_ctrlr->ctrlr; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_nvme", SPDK_LOG_BDEV_NVME) diff --git a/src/spdk/lib/bdev/nvme/bdev_nvme.h b/src/spdk/lib/bdev/nvme/bdev_nvme.h new file mode 100644 index 00000000..b8c458e8 --- /dev/null +++ b/src/spdk/lib/bdev/nvme/bdev_nvme.h @@ -0,0 +1,112 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BDEV_NVME_H +#define SPDK_BDEV_NVME_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/bdev_module.h" + +#define NVME_MAX_CONTROLLERS 1024 + +enum spdk_bdev_timeout_action { + SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE = 0, + SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET, + SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT, +}; + +struct spdk_bdev_nvme_opts { + enum spdk_bdev_timeout_action action_on_timeout; + uint64_t timeout_us; + uint32_t retry_count; + uint64_t nvme_adminq_poll_period_us; +}; + +struct nvme_ctrlr { + /** + * points to pinned, physically contiguous memory region; + * contains 4KB IDENTIFY structure for controller which is + * target for CONTROLLER IDENTIFY command during initialization + */ + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_transport_id trid; + char *name; + int ref; + uint32_t num_ns; + /** Array of bdevs indexed by nsid - 1 */ + struct nvme_bdev *bdevs; + + struct spdk_poller *adminq_timer_poller; + + /** linked list pointer for device list */ + TAILQ_ENTRY(nvme_ctrlr) tailq; +}; + +struct nvme_bdev { + struct spdk_bdev disk; + struct nvme_ctrlr *nvme_ctrlr; + uint32_t id; + bool active; + struct spdk_nvme_ns *ns; +}; + +void spdk_bdev_nvme_dump_trid_json(struct spdk_nvme_transport_id *trid, + struct spdk_json_write_ctx *w); + +struct spdk_nvme_qpair *spdk_bdev_nvme_get_io_qpair(struct spdk_io_channel *ctrlr_io_ch); +struct nvme_ctrlr *spdk_bdev_nvme_lookup_ctrlr(const char *ctrlr_name); +struct nvme_ctrlr *spdk_bdev_nvme_first_ctrlr(void); +struct nvme_ctrlr *spdk_bdev_nvme_next_ctrlr(struct nvme_ctrlr *prev); +void spdk_bdev_nvme_get_opts(struct spdk_bdev_nvme_opts *opts); +int spdk_bdev_nvme_set_opts(const struct spdk_bdev_nvme_opts *opts); +int spdk_bdev_nvme_set_hotplug(bool enabled, uint64_t period_us, spdk_thread_fn cb, void *cb_ctx); + +int spdk_bdev_nvme_create(struct spdk_nvme_transport_id *trid, + const char *base_name, + const char **names, size_t *count, + const char *hostnqn); +struct spdk_nvme_ctrlr *spdk_bdev_nvme_get_ctrlr(struct spdk_bdev *bdev); + +/** + * Delete NVMe controller with all bdevs on top of it. + * Requires to pass name of NVMe controller. + * + * \param name NVMe controller name + * \return zero on success, -EINVAL on wrong parameters or -ENODEV if controller is not found + */ +int spdk_bdev_nvme_delete(const char *name); + +#endif // SPDK_BDEV_NVME_H diff --git a/src/spdk/lib/bdev/nvme/bdev_nvme_rpc.c b/src/spdk/lib/bdev/nvme/bdev_nvme_rpc.c new file mode 100644 index 00000000..0312a756 --- /dev/null +++ b/src/spdk/lib/bdev/nvme/bdev_nvme_rpc.c @@ -0,0 +1,740 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "bdev_nvme.h" + +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" +#include "spdk/bdev_module.h" + +struct open_descriptors { + void *desc; + struct spdk_bdev *bdev; + TAILQ_ENTRY(open_descriptors) tqlst; +}; +typedef TAILQ_HEAD(, open_descriptors) open_descriptors_t; + +static int +rpc_decode_action_on_timeout(const struct spdk_json_val *val, void *out) +{ + enum spdk_bdev_timeout_action *action = out; + + if (spdk_json_strequal(val, "none") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_NONE; + } else if (spdk_json_strequal(val, "abort") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_ABORT; + } else if (spdk_json_strequal(val, "reset") == true) { + *action = SPDK_BDEV_NVME_TIMEOUT_ACTION_RESET; + } else { + SPDK_NOTICELOG("Invalid parameter value: action_on_timeout\n"); + return -EINVAL; + } + + return 0; +} + +static const struct spdk_json_object_decoder rpc_bdev_nvme_options_decoders[] = { + {"action_on_timeout", offsetof(struct spdk_bdev_nvme_opts, action_on_timeout), rpc_decode_action_on_timeout, true}, + {"timeout_us", offsetof(struct spdk_bdev_nvme_opts, timeout_us), spdk_json_decode_uint64, true}, + {"retry_count", offsetof(struct spdk_bdev_nvme_opts, retry_count), spdk_json_decode_uint32, true}, + {"nvme_adminq_poll_period_us", offsetof(struct spdk_bdev_nvme_opts, nvme_adminq_poll_period_us), spdk_json_decode_uint64, true}, +}; + +static void +spdk_rpc_set_bdev_nvme_options(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_bdev_nvme_opts opts; + struct spdk_json_write_ctx *w; + int rc; + + spdk_bdev_nvme_get_opts(&opts); + if (params && spdk_json_decode_object(params, rpc_bdev_nvme_options_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_options_decoders), + &opts)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_bdev_nvme_set_opts(&opts); + if (rc) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } + + return; +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("set_bdev_nvme_options", spdk_rpc_set_bdev_nvme_options, SPDK_RPC_STARTUP) + +struct rpc_bdev_nvme_hotplug { + bool enabled; + uint64_t period_us; +}; + +static const struct spdk_json_object_decoder rpc_bdev_nvme_hotplug_decoders[] = { + {"enable", offsetof(struct rpc_bdev_nvme_hotplug, enabled), spdk_json_decode_bool, false}, + {"period_us", offsetof(struct rpc_bdev_nvme_hotplug, period_us), spdk_json_decode_uint64, true}, +}; + +static void 
+rpc_set_bdev_nvme_hotplug_done(void *ctx) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_result(request); + + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } +} + +static void +spdk_rpc_set_bdev_nvme_hotplug(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_bdev_nvme_hotplug req = {false, 0}; + int rc; + + if (spdk_json_decode_object(params, rpc_bdev_nvme_hotplug_decoders, + SPDK_COUNTOF(rpc_bdev_nvme_hotplug_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_bdev_nvme_set_hotplug(req.enabled, req.period_us, rpc_set_bdev_nvme_hotplug_done, + request); + if (rc) { + goto invalid; + } + + return; +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("set_bdev_nvme_hotplug", spdk_rpc_set_bdev_nvme_hotplug, SPDK_RPC_RUNTIME) + +struct rpc_construct_nvme { + char *name; + char *trtype; + char *adrfam; + char *traddr; + char *trsvcid; + char *subnqn; + char *hostnqn; +}; + +static void +free_rpc_construct_nvme(struct rpc_construct_nvme *req) +{ + free(req->name); + free(req->trtype); + free(req->adrfam); + free(req->traddr); + free(req->trsvcid); + free(req->subnqn); + free(req->hostnqn); +} + +static const struct spdk_json_object_decoder rpc_construct_nvme_decoders[] = { + {"name", offsetof(struct rpc_construct_nvme, name), spdk_json_decode_string}, + {"trtype", offsetof(struct rpc_construct_nvme, trtype), spdk_json_decode_string}, + {"traddr", offsetof(struct rpc_construct_nvme, traddr), spdk_json_decode_string}, + + {"adrfam", offsetof(struct rpc_construct_nvme, adrfam), spdk_json_decode_string, true}, + {"trsvcid", offsetof(struct rpc_construct_nvme, trsvcid), spdk_json_decode_string, true}, + {"subnqn", offsetof(struct rpc_construct_nvme, subnqn), spdk_json_decode_string, true}, + {"hostnqn", offsetof(struct rpc_construct_nvme, hostnqn), spdk_json_decode_string, true} +}; + +#define NVME_MAX_BDEVS_PER_RPC 128 + +static void +spdk_rpc_construct_nvme_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_nvme req = {}; + struct spdk_json_write_ctx *w; + struct spdk_nvme_transport_id trid = {}; + const char *names[NVME_MAX_BDEVS_PER_RPC]; + size_t count; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_nvme_decoders, + SPDK_COUNTOF(rpc_construct_nvme_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + /* Parse trtype */ + rc = spdk_nvme_transport_id_parse_trtype(&trid.trtype, req.trtype); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse trtype: %s\n", req.trtype); + goto invalid; + } + + /* Parse traddr */ + snprintf(trid.traddr, sizeof(trid.traddr), "%s", req.traddr); + + /* Parse adrfam */ + if (req.adrfam) { + rc = spdk_nvme_transport_id_parse_adrfam(&trid.adrfam, req.adrfam); + if (rc < 0) { + SPDK_ERRLOG("Failed to parse adrfam: %s\n", req.adrfam); + goto invalid; + } + } + + /* Parse trsvcid */ + if (req.trsvcid) { + snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", req.trsvcid); + } + + /* Parse subnqn */ + if (req.subnqn) { + snprintf(trid.subnqn, sizeof(trid.subnqn), "%s", req.subnqn); + } + + count = NVME_MAX_BDEVS_PER_RPC; + if (spdk_bdev_nvme_create(&trid, req.name, names, &count, req.hostnqn)) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) 
{ + free_rpc_construct_nvme(&req); + return; + } + + spdk_json_write_array_begin(w); + for (i = 0; i < count; i++) { + spdk_json_write_string(w, names[i]); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + + free_rpc_construct_nvme(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_construct_nvme(&req); +} +SPDK_RPC_REGISTER("construct_nvme_bdev", spdk_rpc_construct_nvme_bdev, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_dump_nvme_controller_info(struct spdk_json_write_ctx *w, + struct nvme_ctrlr *nvme_ctrlr) +{ + struct spdk_nvme_transport_id *trid; + + trid = &nvme_ctrlr->trid; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "name", nvme_ctrlr->name); + + spdk_json_write_named_object_begin(w, "trid"); + spdk_bdev_nvme_dump_trid_json(trid, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +struct rpc_get_nvme_controllers { + char *name; +}; + +static void +free_rpc_get_nvme_controllers(struct rpc_get_nvme_controllers *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_get_nvme_controllers_decoders[] = { + {"name", offsetof(struct rpc_get_nvme_controllers, name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_nvme_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_nvme_controllers req = {}; + struct spdk_json_write_ctx *w; + struct nvme_ctrlr *ctrlr = NULL; + + if (params && spdk_json_decode_object(params, rpc_get_nvme_controllers_decoders, + SPDK_COUNTOF(rpc_get_nvme_controllers_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name) { + ctrlr = spdk_bdev_nvme_lookup_ctrlr(req.name); + if (ctrlr == NULL) { + SPDK_ERRLOG("ctrlr '%s' does not exist\n", req.name); + goto invalid; + } + } + + free_rpc_get_nvme_controllers(&req); + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + if (ctrlr != NULL) { + spdk_rpc_dump_nvme_controller_info(w, ctrlr); + } else { + for (ctrlr = spdk_bdev_nvme_first_ctrlr(); ctrlr; ctrlr = spdk_bdev_nvme_next_ctrlr(ctrlr)) { + spdk_rpc_dump_nvme_controller_info(w, ctrlr); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + free_rpc_get_nvme_controllers(&req); +} +SPDK_RPC_REGISTER("get_nvme_controllers", spdk_rpc_get_nvme_controllers, SPDK_RPC_RUNTIME) + +struct rpc_delete_nvme { + char *name; +}; + +static void +free_rpc_delete_nvme(struct rpc_delete_nvme *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_nvme_decoders[] = { + {"name", offsetof(struct rpc_delete_nvme, name), spdk_json_decode_string}, +}; + +static void +spdk_rpc_delete_nvme_ctrlr(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_nvme req = {NULL}; + struct spdk_json_write_ctx *w; + int rc = 0; + + if (spdk_json_decode_object(params, rpc_delete_nvme_decoders, + SPDK_COUNTOF(rpc_delete_nvme_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = spdk_bdev_nvme_delete(req.name); + if (rc != 0) { + goto invalid; + } + + free_rpc_delete_nvme(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + 
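/* Illustrative request (controller name is hypothetical): {"method": "delete_nvme_controller", + * "params": {"name": "Nvme0"}}, wrapped in the usual JSON-RPC envelope; on success the + * result written here is a bare boolean true. */ +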
spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_delete_nvme(&req); +} +SPDK_RPC_REGISTER("delete_nvme_controller", spdk_rpc_delete_nvme_ctrlr, SPDK_RPC_RUNTIME) + +struct rpc_apply_firmware { + char *filename; + char *bdev_name; +}; + +static void +free_rpc_apply_firmware(struct rpc_apply_firmware *req) +{ + free(req->filename); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_apply_firmware_decoders[] = { + {"filename", offsetof(struct rpc_apply_firmware, filename), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_apply_firmware, bdev_name), spdk_json_decode_string}, +}; + +struct firmware_update_info { + void *fw_image; + void *p; + unsigned int size; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + + void *desc; + struct spdk_io_channel *ch; + struct spdk_jsonrpc_request *request; + struct spdk_nvme_ctrlr *ctrlr; + open_descriptors_t desc_head; + struct rpc_apply_firmware *req; +}; + +static void +apply_firmware_cleanup(void *cb_arg) +{ + struct open_descriptors *opt, *tmp; + struct firmware_update_info *firm_ctx = cb_arg; + + if (!firm_ctx) { + return; + } + + if (firm_ctx->fw_image) { + spdk_dma_free(firm_ctx->fw_image); + } + + if (firm_ctx->req) { + free_rpc_apply_firmware(firm_ctx->req); + free(firm_ctx->req); + } + TAILQ_FOREACH_SAFE(opt, &firm_ctx->desc_head, tqlst, tmp) { + TAILQ_REMOVE(&firm_ctx->desc_head, opt, tqlst); + spdk_bdev_close(opt->desc); + free(opt); + } + free(firm_ctx); +} + +static void +apply_firmware_complete_reset(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + int rc; + struct spdk_json_write_ctx *w; + struct firmware_update_info *firm_ctx = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "firmware commit failed."); + apply_firmware_cleanup(firm_ctx); + return; + } + + if ((rc = spdk_nvme_ctrlr_reset(firm_ctx->ctrlr)) != 0) { + spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Controller reset failed."); + apply_firmware_cleanup(firm_ctx); + return; + } + + if (!(w = spdk_jsonrpc_begin_result(firm_ctx->request))) { + apply_firmware_cleanup(firm_ctx); + return; + } + + spdk_json_write_string(w, "firmware commit succeeded. Controller reset in progress."); + spdk_jsonrpc_end_result(firm_ctx->request, w); + apply_firmware_cleanup(firm_ctx); +} + +static void +apply_firmware_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_nvme_cmd cmd = {}; + struct spdk_nvme_fw_commit fw_commit; + int slot = 0; + int rc; + struct firmware_update_info *firm_ctx = cb_arg; + enum spdk_nvme_fw_commit_action commit_action = SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG; + + if (!success) { + spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "firmware download failed ."); + spdk_bdev_free_io(bdev_io); + apply_firmware_cleanup(firm_ctx); + return; + } + + firm_ctx->p += firm_ctx->transfer; + firm_ctx->offset += firm_ctx->transfer; + firm_ctx->size_remaining -= firm_ctx->transfer; + + switch (firm_ctx->size_remaining) { + case 0: + /* firmware download completed. 
Commit firmware */ + memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); + fw_commit.fs = slot; + fw_commit.ca = commit_action; + + cmd.opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; + memcpy(&cmd.cdw10, &fw_commit, sizeof(uint32_t)); + rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, NULL, 0, + apply_firmware_complete_reset, firm_ctx); + if (rc) { + spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "firmware commit failed."); + spdk_bdev_free_io(bdev_io); + apply_firmware_cleanup(firm_ctx); + return; + } + break; + default: + firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); + cmd.opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + + cmd.cdw10 = (firm_ctx->transfer >> 2) - 1; + cmd.cdw11 = firm_ctx->offset >> 2; + rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, &cmd, firm_ctx->p, + firm_ctx->transfer, apply_firmware_complete, firm_ctx); + if (rc) { + spdk_jsonrpc_send_error_response(firm_ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "firmware download failed."); + spdk_bdev_free_io(bdev_io); + apply_firmware_cleanup(firm_ctx); + return; + } + break; + } +} + +static void +spdk_rpc_apply_nvme_firmware(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + int rc; + int fd = -1; + struct stat fw_stat; + struct spdk_nvme_ctrlr *ctrlr; + char msg[1024]; + struct spdk_bdev *bdev; + struct spdk_bdev *bdev2; + struct open_descriptors *opt; + struct spdk_bdev_desc *desc; + struct spdk_nvme_cmd *cmd; + struct firmware_update_info *firm_ctx; + + firm_ctx = malloc(sizeof(struct firmware_update_info)); + if (!firm_ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error."); + return; + } + firm_ctx->fw_image = NULL; + TAILQ_INIT(&firm_ctx->desc_head); + firm_ctx->request = request; + + firm_ctx->req = malloc(sizeof(struct rpc_apply_firmware)); + if (!firm_ctx->req) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error."); + free(firm_ctx); + return; + } + + if (spdk_json_decode_object(params, rpc_apply_firmware_decoders, + SPDK_COUNTOF(rpc_apply_firmware_decoders), firm_ctx->req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "spdk_json_decode_object failed."); + free(firm_ctx->req); + free(firm_ctx); + return; + } + + if ((bdev = spdk_bdev_get_by_name(firm_ctx->req->bdev_name)) == NULL) { + snprintf(msg, sizeof(msg), "bdev %s were not found", firm_ctx->req->bdev_name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); + apply_firmware_cleanup(firm_ctx); + return; + } + + if ((ctrlr = spdk_bdev_nvme_get_ctrlr(bdev)) == NULL) { + snprintf(msg, sizeof(msg), "Controller information for %s were not found.", + firm_ctx->req->bdev_name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); + apply_firmware_cleanup(firm_ctx); + return; + } + firm_ctx->ctrlr = ctrlr; + + for (bdev2 = spdk_bdev_first(); bdev2; bdev2 = spdk_bdev_next(bdev2)) { + + if (spdk_bdev_nvme_get_ctrlr(bdev2) != ctrlr) { + continue; + } + + if (!(opt = malloc(sizeof(struct open_descriptors)))) { + snprintf(msg, sizeof(msg), "Memory allocation error."); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); + apply_firmware_cleanup(firm_ctx); + return; + } + + if ((rc = spdk_bdev_open(bdev2, true, NULL, NULL, &desc)) != 0) { + snprintf(msg, sizeof(msg), "Device %s is in use.", 
firm_ctx->req->bdev_name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, msg); + free(opt); + apply_firmware_cleanup(firm_ctx); + return; + } + + opt->desc = desc; + opt->bdev = bdev; + TAILQ_INSERT_TAIL(&firm_ctx->desc_head, opt, tqlst); + } + + /* + * find a descriptor associated with our bdev + */ + firm_ctx->desc = NULL; + TAILQ_FOREACH(opt, &firm_ctx->desc_head, tqlst) { + if (opt->bdev == bdev) { + firm_ctx->desc = opt->desc; + break; + } + } + + if (!firm_ctx->desc) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "No descriptor were found."); + apply_firmware_cleanup(firm_ctx); + return; + } + + firm_ctx->ch = spdk_bdev_get_io_channel(firm_ctx->desc); + if (!firm_ctx->ch) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "No channels were found."); + apply_firmware_cleanup(firm_ctx); + return; + } + + fd = open(firm_ctx->req->filename, O_RDONLY); + if (fd < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "open file failed."); + apply_firmware_cleanup(firm_ctx); + return; + } + + rc = fstat(fd, &fw_stat); + if (rc < 0) { + close(fd); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "fstat failed."); + apply_firmware_cleanup(firm_ctx); + return; + } + + firm_ctx->size = fw_stat.st_size; + if (fw_stat.st_size % 4) { + close(fd); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Firmware image size is not multiple of 4."); + apply_firmware_cleanup(firm_ctx); + return; + } + + firm_ctx->fw_image = spdk_dma_zmalloc(firm_ctx->size, 4096, NULL); + if (!firm_ctx->fw_image) { + close(fd); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error."); + apply_firmware_cleanup(firm_ctx); + return; + } + firm_ctx->p = firm_ctx->fw_image; + + if (read(fd, firm_ctx->p, firm_ctx->size) != ((ssize_t)(firm_ctx->size))) { + close(fd); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Read firmware image failed!"); + apply_firmware_cleanup(firm_ctx); + return; + } + close(fd); + + firm_ctx->offset = 0; + firm_ctx->size_remaining = firm_ctx->size; + firm_ctx->transfer = spdk_min(firm_ctx->size_remaining, 4096); + + cmd = malloc(sizeof(struct spdk_nvme_cmd)); + if (!cmd) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Memory allocation error."); + apply_firmware_cleanup(firm_ctx); + return; + } + memset(cmd, 0, sizeof(struct spdk_nvme_cmd)); + cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + + cmd->cdw10 = (firm_ctx->transfer >> 2) - 1; + cmd->cdw11 = firm_ctx->offset >> 2; + + rc = spdk_bdev_nvme_admin_passthru(firm_ctx->desc, firm_ctx->ch, cmd, firm_ctx->p, + firm_ctx->transfer, apply_firmware_complete, firm_ctx); + if (rc) { + free(cmd); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Read firmware image failed!"); + apply_firmware_cleanup(firm_ctx); + return; + } +} +SPDK_RPC_REGISTER("apply_nvme_firmware", spdk_rpc_apply_nvme_firmware, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/nvme/nvme_rpc.c b/src/spdk/lib/bdev/nvme/nvme_rpc.c new file mode 100644 index 00000000..b49a7d42 --- /dev/null +++ b/src/spdk/lib/bdev/nvme/nvme_rpc.c @@ -0,0 +1,487 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_nvme.h" +#include "spdk/base64.h" + +enum spdk_nvme_rpc_type { + NVME_ADMIN_CMD = 1, + NVME_IO_CMD, +}; + +struct rpc_send_nvme_cmd_req { + char *name; + int cmd_type; + int data_direction; + uint32_t timeout_ms; + uint32_t data_len; + uint32_t md_len; + + struct spdk_nvme_cmd *cmdbuf; + char *data; + char *md; +}; + +struct rpc_send_nvme_cmd_resp { + char *cpl_text; + char *data_text; + char *md_text; +}; + +struct rpc_send_nvme_cmd_ctx { + struct spdk_jsonrpc_request *jsonrpc_request; + struct rpc_send_nvme_cmd_req req; + struct rpc_send_nvme_cmd_resp resp; + struct nvme_ctrlr *nvme_ctrlr; + struct spdk_io_channel *ctrlr_io_ch; +}; + +static void +free_rpc_send_nvme_cmd_ctx(struct rpc_send_nvme_cmd_ctx *ctx) +{ + assert(ctx != NULL); + + free(ctx->req.name); + free(ctx->req.cmdbuf); + spdk_dma_free(ctx->req.data); + spdk_dma_free(ctx->req.md); + free(ctx->resp.cpl_text); + free(ctx->resp.data_text); + free(ctx->resp.md_text); + free(ctx); +} + +static int +rpc_send_nvme_cmd_resp_construct(struct rpc_send_nvme_cmd_resp *resp, + struct rpc_send_nvme_cmd_req *req, + const struct spdk_nvme_cpl *cpl) +{ + resp->cpl_text = malloc(spdk_base64_get_encoded_strlen(sizeof(*cpl)) + 1); + if (!resp->cpl_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->cpl_text, cpl, sizeof(*cpl)); + + if (req->data_direction == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + if (req->data_len) { + resp->data_text = malloc(spdk_base64_get_encoded_strlen(req->data_len) + 1); + if (!resp->data_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->data_text, req->data, req->data_len); + } + if (req->md_len) { + resp->md_text = malloc(spdk_base64_get_encoded_strlen(req->md_len) + 1); + if (!resp->md_text) { + return -ENOMEM; + } + spdk_base64_urlsafe_encode(resp->md_text, req->md, req->md_len); + } + } + + return 0; +} + +static 
void +spdk_rpc_send_nvme_cmd_complete(struct rpc_send_nvme_cmd_ctx *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_jsonrpc_request *request = ctx->jsonrpc_request; + struct spdk_json_write_ctx *w; + int ret; + + ret = rpc_send_nvme_cmd_resp_construct(&ctx->resp, &ctx->req, cpl); + if (ret) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(-ret)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + goto out; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "cpl", ctx->resp.cpl_text); + + if (ctx->resp.data_text) { + spdk_json_write_named_string(w, "data", ctx->resp.data_text); + } + + if (ctx->resp.md_text) { + spdk_json_write_named_string(w, "metadata", ctx->resp.md_text); + } + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); + +out: + free_rpc_send_nvme_cmd_ctx(ctx); + return; +} + +static void +nvme_rpc_bdev_nvme_cb(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct rpc_send_nvme_cmd_ctx *ctx = (struct rpc_send_nvme_cmd_ctx *)ref; + + if (ctx->ctrlr_io_ch) { + spdk_put_io_channel(ctx->ctrlr_io_ch); + ctx->ctrlr_io_ch = NULL; + } + + spdk_rpc_send_nvme_cmd_complete(ctx, cpl); +} + +static int +nvme_rpc_admin_cmd_bdev_nvme(struct rpc_send_nvme_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, + void *buf, uint32_t nbytes, uint32_t timeout_ms) +{ + struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; + int ret; + + ret = spdk_nvme_ctrlr_cmd_admin_raw(_nvme_ctrlr->ctrlr, cmd, buf, + nbytes, nvme_rpc_bdev_nvme_cb, ctx); + + return ret; +} + +static int +nvme_rpc_io_cmd_bdev_nvme(struct rpc_send_nvme_cmd_ctx *ctx, struct spdk_nvme_cmd *cmd, + void *buf, uint32_t nbytes, void *md_buf, uint32_t md_len, + uint32_t timeout_ms) +{ + struct nvme_ctrlr *_nvme_ctrlr = ctx->nvme_ctrlr; + struct spdk_nvme_qpair *io_qpair; + int ret; + + ctx->ctrlr_io_ch = spdk_get_io_channel(_nvme_ctrlr->ctrlr); + io_qpair = spdk_bdev_nvme_get_io_qpair(ctx->ctrlr_io_ch); + + ret = spdk_nvme_ctrlr_cmd_io_raw_with_md(_nvme_ctrlr->ctrlr, io_qpair, + cmd, buf, nbytes, md_buf, nvme_rpc_bdev_nvme_cb, ctx); + if (ret) { + spdk_put_io_channel(ctx->ctrlr_io_ch); + } + + return ret; + +} + +static int +rpc_send_nvme_cmd_exec(struct rpc_send_nvme_cmd_ctx *ctx) +{ + struct rpc_send_nvme_cmd_req *req = &ctx->req; + int ret = -EINVAL; + + switch (req->cmd_type) { + case NVME_ADMIN_CMD: + ret = nvme_rpc_admin_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, + req->data_len, req->timeout_ms); + break; + case NVME_IO_CMD: + ret = nvme_rpc_io_cmd_bdev_nvme(ctx, req->cmdbuf, req->data, + req->data_len, req->md, req->md_len, req->timeout_ms); + break; + } + + return ret; +} + +static int +rpc_decode_cmd_type(const struct spdk_json_val *val, void *out) +{ + int *cmd_type = out; + + if (spdk_json_strequal(val, "admin") == true) { + *cmd_type = NVME_ADMIN_CMD; + } else if (spdk_json_strequal(val, "io") == true) { + *cmd_type = NVME_IO_CMD; + } else { + SPDK_NOTICELOG("Invalid parameter value: cmd_type\n"); + return -EINVAL; + } + + return 0; +} + +static int +rpc_decode_data_direction(const struct spdk_json_val *val, void *out) +{ + int *data_direction = out; + + if (spdk_json_strequal(val, "h2c") == true) { + *data_direction = SPDK_NVME_DATA_HOST_TO_CONTROLLER; + } else if (spdk_json_strequal(val, "c2h") == true) { + *data_direction = SPDK_NVME_DATA_CONTROLLER_TO_HOST; + } else { + SPDK_NOTICELOG("Invalid parameter value: data_direction\n"); + return -EINVAL; + } + + return 0; +} + +static int +rpc_decode_cmdbuf(const 
struct spdk_json_val *val, void *out) +{ + char *text = NULL; + size_t text_strlen, raw_len; + struct spdk_nvme_cmd *cmdbuf, **_cmdbuf = out; + int rc; + + rc = spdk_json_decode_string(val, &text); + if (rc) { + return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; + } + + text_strlen = strlen(text); + raw_len = spdk_base64_get_decoded_len(text_strlen); + cmdbuf = malloc(raw_len); + if (!cmdbuf) { + rc = -ENOMEM; + goto out; + } + + rc = spdk_base64_urlsafe_decode(cmdbuf, &raw_len, text); + if (rc) { + goto out; + } + if (raw_len != sizeof(*cmdbuf)) { + rc = -EINVAL; + goto out; + } + + *_cmdbuf = cmdbuf; + +out: + free(text); + return rc; +} + +static int +rpc_decode_data(const struct spdk_json_val *val, void *out) +{ + struct rpc_send_nvme_cmd_req *req = (struct rpc_send_nvme_cmd_req *)out; + char *text = NULL; + size_t text_strlen; + int rc; + + rc = spdk_json_decode_string(val, &text); + if (rc) { + return val->type == SPDK_JSON_VAL_STRING ? -ENOMEM : -EINVAL; + } + text_strlen = strlen(text); + + if (req->data_len) { + /* data_len is decoded by param "data_len" */ + if (req->data_len != spdk_base64_get_decoded_len(text_strlen)) { + rc = -EINVAL; + goto out; + } + } else { + req->data_len = spdk_base64_get_decoded_len(text_strlen); + req->data = spdk_dma_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, NULL); + if (!req->data) { + rc = -ENOMEM; + goto out; + } + } + + rc = spdk_base64_urlsafe_decode(req->data, (size_t *)&req->data_len, text); + +out: + free(text); + return rc; +} + +static int +rpc_decode_data_len(const struct spdk_json_val *val, void *out) +{ + struct rpc_send_nvme_cmd_req *req = (struct rpc_send_nvme_cmd_req *)out; + uint32_t data_len; + int rc; + + rc = spdk_json_decode_uint32(val, &data_len); + if (rc) { + return rc; + } + + if (req->data_len) { + /* data_len is decoded by param "data" */ + if (req->data_len != data_len) { + rc = -EINVAL; + } + } else { + req->data_len = data_len; + req->data = spdk_dma_malloc(req->data_len > 0x1000 ? req->data_len : 0x1000, 0x1000, NULL); + if (!req->data) { + rc = -ENOMEM; + } + } + + return rc; +} + +static int +rpc_decode_metadata(const struct spdk_json_val *val, void *out) +{ + struct rpc_send_nvme_cmd_req *req = (struct rpc_send_nvme_cmd_req *)out; + char *text = NULL; + size_t text_strlen; + int rc; + + rc = spdk_json_decode_string(val, &text); + if (rc) { + return rc = val->type == SPDK_JSON_VAL_STRING ? 
-ENOMEM : -EINVAL; + } + text_strlen = strlen(text); + + if (req->md_len) { + /* md_len is decoded by param "metadata_len" */ + if (req->md_len != spdk_base64_get_decoded_len(text_strlen)) { + rc = -EINVAL; + goto out; + } + } else { + req->md_len = spdk_base64_get_decoded_len(text_strlen); + req->md = spdk_dma_malloc(req->md_len, 0x1000, NULL); + if (!req->md) { + rc = -ENOMEM; + goto out; + } + } + + rc = spdk_base64_urlsafe_decode(req->md, (size_t *)&req->md_len, text); + +out: + free(text); + return rc; +} + +static int +rpc_decode_metadata_len(const struct spdk_json_val *val, void *out) +{ + struct rpc_send_nvme_cmd_req *req = (struct rpc_send_nvme_cmd_req *)out; + uint32_t md_len; + int rc; + + rc = spdk_json_decode_uint32(val, &md_len); + if (rc) { + return rc; + } + + if (req->md_len) { + /* md_len is decoded by param "metadata" */ + if (req->md_len != md_len) { + rc = -EINVAL; + } + } else { + req->md_len = md_len; + req->md = spdk_dma_malloc(req->md_len, 0x1000, NULL); + if (!req->md) { + rc = -ENOMEM; + } + } + + return rc; +} + +static const struct spdk_json_object_decoder rpc_send_nvme_cmd_req_decoders[] = { + {"name", offsetof(struct rpc_send_nvme_cmd_req, name), spdk_json_decode_string}, + {"cmd_type", offsetof(struct rpc_send_nvme_cmd_req, cmd_type), rpc_decode_cmd_type}, + {"data_direction", offsetof(struct rpc_send_nvme_cmd_req, data_direction), rpc_decode_data_direction}, + {"cmdbuf", offsetof(struct rpc_send_nvme_cmd_req, cmdbuf), rpc_decode_cmdbuf}, + {"timeout_ms", offsetof(struct rpc_send_nvme_cmd_req, timeout_ms), spdk_json_decode_uint32, true}, + {"data_len", 0, rpc_decode_data_len, true}, + {"metadata_len", 0, rpc_decode_metadata_len, true}, + {"data", 0, rpc_decode_data, true}, + {"metadata", 0, rpc_decode_metadata, true}, +}; + +static void +spdk_rpc_send_nvme_cmd(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_send_nvme_cmd_ctx *ctx; + int ret, error_code; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Failed at Malloc ctx\n"); + error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; + ret = -ENOMEM; + goto invalid; + } + + if (spdk_json_decode_object(params, rpc_send_nvme_cmd_req_decoders, + SPDK_COUNTOF(rpc_send_nvme_cmd_req_decoders), + &ctx->req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; + ret = -EINVAL; + goto invalid; + } + + ctx->nvme_ctrlr = spdk_bdev_nvme_lookup_ctrlr(ctx->req.name); + if (ctx->nvme_ctrlr == NULL) { + SPDK_ERRLOG("Failed at device lookup\n"); + error_code = SPDK_JSONRPC_ERROR_INVALID_PARAMS; + ret = -EINVAL; + goto invalid; + } + + ctx->jsonrpc_request = request; + + ret = rpc_send_nvme_cmd_exec(ctx); + if (ret < 0) { + SPDK_NOTICELOG("Failed at rpc_send_nvme_cmd_exec\n"); + error_code = SPDK_JSONRPC_ERROR_INTERNAL_ERROR; + goto invalid; + } + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, error_code, spdk_strerror(-ret)); + free_rpc_send_nvme_cmd_ctx(ctx); + return; +} +SPDK_RPC_REGISTER("send_nvme_cmd", spdk_rpc_send_nvme_cmd, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/part.c b/src/spdk/lib/bdev/part.c new file mode 100644 index 00000000..0cb4759b --- /dev/null +++ b/src/spdk/lib/bdev/part.c @@ -0,0 +1,373 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Common code for partition-like virtual bdevs. + */ + +#include "spdk/bdev.h" +#include "spdk/log.h" +#include "spdk/string.h" + +#include "spdk/bdev_module.h" + +struct spdk_bdev_part_base { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + uint32_t ref; + uint32_t channel_size; + spdk_bdev_part_base_free_fn base_free_fn; + void *ctx; + bool claimed; + struct spdk_bdev_module *module; + struct spdk_bdev_fn_table *fn_table; + struct bdev_part_tailq *tailq; + spdk_io_channel_create_cb ch_create_cb; + spdk_io_channel_destroy_cb ch_destroy_cb; +}; + +struct spdk_bdev * +spdk_bdev_part_base_get_bdev(struct spdk_bdev_part_base *part_base) +{ + return part_base->bdev; +} + +struct spdk_bdev_desc * +spdk_bdev_part_base_get_desc(struct spdk_bdev_part_base *part_base) +{ + return part_base->desc; +} + +struct bdev_part_tailq * +spdk_bdev_part_base_get_tailq(struct spdk_bdev_part_base *part_base) +{ + return part_base->tailq; +} + +void * +spdk_bdev_part_base_get_ctx(struct spdk_bdev_part_base *part_base) +{ + return part_base->ctx; +} + +void +spdk_bdev_part_base_free(struct spdk_bdev_part_base *base) +{ + if (base->desc) { + spdk_bdev_close(base->desc); + base->desc = NULL; + } + + if (base->base_free_fn != NULL) { + base->base_free_fn(base->ctx); + } + + free(base); +} + +static void +spdk_bdev_part_free_cb(void *io_device) +{ + struct spdk_bdev_part *part = io_device; + struct spdk_bdev_part_base *base; + + assert(part); + assert(part->internal.base); + + base = part->internal.base; + + TAILQ_REMOVE(base->tailq, part, tailq); + + if (__sync_sub_and_fetch(&base->ref, 1) == 0) { + spdk_bdev_module_release_bdev(base->bdev); + spdk_bdev_part_base_free(base); + } + + spdk_bdev_destruct_done(&part->internal.bdev, 0); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + free(part); +} + +int +spdk_bdev_part_free(struct spdk_bdev_part *part) +{ + spdk_io_device_unregister(part, spdk_bdev_part_free_cb); + + /* Return 1 to indicate that this is an asynchronous 
operation that isn't complete + * until spdk_bdev_destruct_done is called */ + return 1; +} + +void +spdk_bdev_part_base_hotremove(struct spdk_bdev *base_bdev, struct bdev_part_tailq *tailq) +{ + struct spdk_bdev_part *part, *tmp; + + TAILQ_FOREACH_SAFE(part, tailq, tailq, tmp) { + if (part->internal.base->bdev == base_bdev) { + spdk_bdev_unregister(&part->internal.bdev, NULL, NULL); + } + } +} + +static bool +spdk_bdev_part_io_type_supported(void *_part, enum spdk_bdev_io_type io_type) +{ + struct spdk_bdev_part *part = _part; + + return part->internal.base->bdev->fn_table->io_type_supported(part->internal.base->bdev->ctxt, + io_type); +} + +static struct spdk_io_channel * +spdk_bdev_part_get_io_channel(void *_part) +{ + struct spdk_bdev_part *part = _part; + + return spdk_get_io_channel(part); +} + +struct spdk_bdev * +spdk_bdev_part_get_bdev(struct spdk_bdev_part *part) +{ + return &part->internal.bdev; +} + +struct spdk_bdev_part_base * +spdk_bdev_part_get_base(struct spdk_bdev_part *part) +{ + return part->internal.base; +} + +struct spdk_bdev * +spdk_bdev_part_get_base_bdev(struct spdk_bdev_part *part) +{ + return part->internal.base->bdev; +} + +uint64_t +spdk_bdev_part_get_offset_blocks(struct spdk_bdev_part *part) +{ + return part->internal.offset_blocks; +} + +static void +spdk_bdev_part_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *part_io = cb_arg; + int status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_bdev_io_complete(part_io, status); + spdk_bdev_free_io(bdev_io); +} + +int +spdk_bdev_part_submit_request(struct spdk_bdev_part_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct spdk_bdev_part *part = ch->part; + struct spdk_io_channel *base_ch = ch->base_ch; + struct spdk_bdev_desc *base_desc = part->internal.base->desc; + uint64_t offset; + int rc = 0; + + /* Modify the I/O to adjust for the offset within the base bdev. 
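+ * Block-addressed I/O types (read, write, write zeroes, unmap, flush) have the partition's + * offset_blocks added to their offset; resets are passed through to the base bdev unchanged.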
*/ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + offset = bdev_io->u.bdev.offset_blocks + part->internal.offset_blocks; + rc = spdk_bdev_readv_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, offset, + bdev_io->u.bdev.num_blocks, spdk_bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + offset = bdev_io->u.bdev.offset_blocks + part->internal.offset_blocks; + rc = spdk_bdev_writev_blocks(base_desc, base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, offset, + bdev_io->u.bdev.num_blocks, spdk_bdev_part_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + offset = bdev_io->u.bdev.offset_blocks + part->internal.offset_blocks; + rc = spdk_bdev_write_zeroes_blocks(base_desc, base_ch, offset, bdev_io->u.bdev.num_blocks, + spdk_bdev_part_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + offset = bdev_io->u.bdev.offset_blocks + part->internal.offset_blocks; + rc = spdk_bdev_unmap_blocks(base_desc, base_ch, offset, bdev_io->u.bdev.num_blocks, + spdk_bdev_part_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + offset = bdev_io->u.bdev.offset_blocks + part->internal.offset_blocks; + rc = spdk_bdev_flush_blocks(base_desc, base_ch, offset, bdev_io->u.bdev.num_blocks, + spdk_bdev_part_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + rc = spdk_bdev_reset(base_desc, base_ch, + spdk_bdev_part_complete_io, bdev_io); + break; + default: + SPDK_ERRLOG("split: unknown I/O type %d\n", bdev_io->type); + return SPDK_BDEV_IO_STATUS_FAILED; + } + + return rc; +} + +static int +spdk_bdev_part_channel_create_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + ch->part = part; + ch->base_ch = spdk_bdev_get_io_channel(part->internal.base->desc); + if (ch->base_ch == NULL) { + return -1; + } + + if (part->internal.base->ch_create_cb) { + return part->internal.base->ch_create_cb(io_device, ctx_buf); + } else { + return 0; + } +} + +static void +spdk_bdev_part_channel_destroy_cb(void *io_device, void *ctx_buf) +{ + struct spdk_bdev_part *part = (struct spdk_bdev_part *)io_device; + struct spdk_bdev_part_channel *ch = ctx_buf; + + if (part->internal.base->ch_destroy_cb) { + part->internal.base->ch_destroy_cb(io_device, ctx_buf); + } + spdk_put_io_channel(ch->base_ch); +} + +struct spdk_bdev_part_base * + spdk_bdev_part_base_construct(struct spdk_bdev *bdev, + spdk_bdev_remove_cb_t remove_cb, struct spdk_bdev_module *module, + struct spdk_bdev_fn_table *fn_table, struct bdev_part_tailq *tailq, + spdk_bdev_part_base_free_fn free_fn, void *ctx, + uint32_t channel_size, spdk_io_channel_create_cb ch_create_cb, + spdk_io_channel_destroy_cb ch_destroy_cb) +{ + int rc; + struct spdk_bdev_part_base *base; + + base = calloc(1, sizeof(*base)); + if (!base) { + SPDK_ERRLOG("Memory allocation failure\n"); + return NULL; + } + fn_table->get_io_channel = spdk_bdev_part_get_io_channel; + fn_table->io_type_supported = spdk_bdev_part_io_type_supported; + + base->bdev = bdev; + base->desc = NULL; + base->ref = 0; + base->module = module; + base->fn_table = fn_table; + base->tailq = tailq; + base->base_free_fn = free_fn; + base->ctx = ctx; + base->claimed = false; + base->channel_size = channel_size; + base->ch_create_cb = ch_create_cb; + base->ch_destroy_cb = ch_destroy_cb; + + rc = spdk_bdev_open(bdev, false, remove_cb, bdev, &base->desc); + if (rc) { + spdk_bdev_part_base_free(base); + 
SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + return NULL; + } + + return base; +} + +int +spdk_bdev_part_construct(struct spdk_bdev_part *part, struct spdk_bdev_part_base *base, + char *name, uint64_t offset_blocks, uint64_t num_blocks, + char *product_name) +{ + part->internal.bdev.blocklen = base->bdev->blocklen; + part->internal.bdev.blockcnt = num_blocks; + part->internal.offset_blocks = offset_blocks; + + part->internal.bdev.write_cache = base->bdev->write_cache; + part->internal.bdev.need_aligned_buffer = base->bdev->need_aligned_buffer; + part->internal.bdev.ctxt = part; + part->internal.bdev.module = base->module; + part->internal.bdev.fn_table = base->fn_table; + + part->internal.bdev.name = strdup(name); + part->internal.bdev.product_name = strdup(product_name); + + if (part->internal.bdev.name == NULL) { + SPDK_ERRLOG("Failed to allocate name for new part of bdev %s\n", spdk_bdev_get_name(base->bdev)); + return -1; + } else if (part->internal.bdev.product_name == NULL) { + free(part->internal.bdev.name); + SPDK_ERRLOG("Failed to allocate product name for new part of bdev %s\n", + spdk_bdev_get_name(base->bdev)); + return -1; + } + + __sync_fetch_and_add(&base->ref, 1); + part->internal.base = base; + + if (!base->claimed) { + int rc; + + rc = spdk_bdev_module_claim_bdev(base->bdev, base->desc, base->module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(base->bdev)); + free(part->internal.bdev.name); + free(part->internal.bdev.product_name); + return -1; + } + base->claimed = true; + } + + spdk_io_device_register(part, spdk_bdev_part_channel_create_cb, + spdk_bdev_part_channel_destroy_cb, + base->channel_size, + name); + + spdk_vbdev_register(&part->internal.bdev, &base->bdev, 1); + TAILQ_INSERT_TAIL(base->tailq, part, tailq); + + return 0; +} diff --git a/src/spdk/lib/bdev/passthru/Makefile b/src/spdk/lib/bdev/passthru/Makefile new file mode 100644 index 00000000..5a2a383a --- /dev/null +++ b/src/spdk/lib/bdev/passthru/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ + +C_SRCS = vbdev_passthru.c vbdev_passthru_rpc.c +LIBNAME = vbdev_passthru + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/passthru/vbdev_passthru.c b/src/spdk/lib/bdev/passthru/vbdev_passthru.c new file mode 100644 index 00000000..4e3dacfc --- /dev/null +++ b/src/spdk/lib/bdev/passthru/vbdev_passthru.c @@ -0,0 +1,671 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a simple example of a virtual block device module that passes IO + * down to a bdev (or bdevs) that it's configured to attach to.
+ */ + +#include "spdk/stdinc.h" + +#include "vbdev_passthru.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + + +static int vbdev_passthru_init(void); +static void vbdev_passthru_get_spdk_running_config(FILE *fp); +static int vbdev_passthru_get_ctx_size(void); +static void vbdev_passthru_examine(struct spdk_bdev *bdev); +static void vbdev_passthru_finish(void); + +static struct spdk_bdev_module passthru_if = { + .name = "passthru", + .module_init = vbdev_passthru_init, + .config_text = vbdev_passthru_get_spdk_running_config, + .get_ctx_size = vbdev_passthru_get_ctx_size, + .examine_config = vbdev_passthru_examine, + .module_fini = vbdev_passthru_finish +}; + +SPDK_BDEV_MODULE_REGISTER(&passthru_if) + +/* List of pt_bdev names and their base bdevs via configuration file. + * Used so we can parse the conf once at init and use this list in examine(). + */ +struct bdev_names { + char *vbdev_name; + char *bdev_name; + TAILQ_ENTRY(bdev_names) link; +}; +static TAILQ_HEAD(, bdev_names) g_bdev_names = TAILQ_HEAD_INITIALIZER(g_bdev_names); + +/* List of virtual bdevs and associated info for each. */ +struct vbdev_passthru { + struct spdk_bdev *base_bdev; /* the thing we're attaching to */ + struct spdk_bdev_desc *base_desc; /* its descriptor we get from open */ + struct spdk_bdev pt_bdev; /* the PT virtual bdev */ + TAILQ_ENTRY(vbdev_passthru) link; +}; +static TAILQ_HEAD(, vbdev_passthru) g_pt_nodes = TAILQ_HEAD_INITIALIZER(g_pt_nodes); + +/* The pt vbdev channel struct. It is allocated and freed on my behalf by the io channel code. + * If this vbdev needed to implement a poller or a queue for IO, this is where those things + * would be defined. This passthru bdev doesn't actually need to allocate a channel, it could + * simply pass back the channel of the bdev underneath it but for example purposes we will + * present its own to the upper layers. + */ +struct pt_io_channel { + struct spdk_io_channel *base_ch; /* IO channel of base device */ +}; + +/* Just for fun, this pt_bdev module doesn't need it but this is essentially a per IO + * context that we get handed by the bdev layer. + */ +struct passthru_bdev_io { + uint8_t test; + + /* bdev related */ + struct spdk_io_channel *ch; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +static void +vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io); + +/* Called after we've unregistered following a hot remove callback. + * Our finish entry point will be called next. + */ +static int +vbdev_passthru_destruct(void *ctx) +{ + struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; + + /* Unclaim the underlying bdev. */ + spdk_bdev_module_release_bdev(pt_node->base_bdev); + + /* Close the underlying bdev. */ + spdk_bdev_close(pt_node->base_desc); + + /* Done with this pt_node. */ + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + free(pt_node->pt_bdev.name); + free(pt_node); + return 0; +} + +/* Completion callback for IO that were issued from this bdev. The original bdev_io + * is passed in as an arg so we'll complete that one with the appropriate status + * and then free the one that this module issued. + */ +static void +_pt_complete_io(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *orig_io = cb_arg; + int status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)orig_io->driver_ctx; + + /* We setup this value in the submission routine, just showing here that it is + * passed back to us. + */ + if (io_ctx->test != 0x5a) { + SPDK_ERRLOG("Error, original IO device_ctx is wrong! 0x%x\n", + io_ctx->test); + } + + /* Complete the original IO and then free the one that we created here + * as a result of issuing an IO via submit_request. + */ + spdk_bdev_io_complete(orig_io, status); + spdk_bdev_free_io(bdev_io); +} + +static void +vbdev_passthru_resubmit_io(void *arg) +{ + struct spdk_bdev_io *bdev_io = (struct spdk_bdev_io *)arg; + struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; + + vbdev_passthru_submit_request(io_ctx->ch, bdev_io); +} + +static void +vbdev_passthru_queue_io(struct spdk_bdev_io *bdev_io) +{ + struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; + int rc; + + io_ctx->bdev_io_wait.bdev = bdev_io->bdev; + io_ctx->bdev_io_wait.cb_fn = vbdev_passthru_resubmit_io; + io_ctx->bdev_io_wait.cb_arg = bdev_io; + + rc = spdk_bdev_queue_io_wait(bdev_io->bdev, io_ctx->ch, &io_ctx->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_passthru_queue_io, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* Callback for getting a buf from the bdev pool in the event that the caller passed + * in NULL; we need to own the buffer so it doesn't get freed by another vbdev module + * beneath us before we're done with it. That won't happen in this example but it could + * if this example were used as a template for something more complex. + */ +static void +pt_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, + pt_bdev); + struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); + + spdk_bdev_readv_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _pt_complete_io, + bdev_io); +} + +/* Called when someone above submits IO to this pt vbdev. We're simply passing it on here + * via SPDK IO calls which in turn allocate another bdev IO and call our cpl callback provided + * below along with the original bdev_io so that we can complete it once this IO completes. + */ +static void +vbdev_passthru_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev_io->bdev, struct vbdev_passthru, pt_bdev); + struct pt_io_channel *pt_ch = spdk_io_channel_get_ctx(ch); + struct passthru_bdev_io *io_ctx = (struct passthru_bdev_io *)bdev_io->driver_ctx; + int rc = 0; + + /* Setup a per IO context value; we don't do anything with it in the vbdev other + * than confirm we get the same thing back in the completion callback just to + * demonstrate.
+ */ + io_ctx->test = 0x5a; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, pt_read_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + rc = spdk_bdev_writev_blocks(pt_node->base_desc, pt_ch->base_ch, bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, _pt_complete_io, + bdev_io); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + rc = spdk_bdev_write_zeroes_blocks(pt_node->base_desc, pt_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _pt_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + rc = spdk_bdev_unmap_blocks(pt_node->base_desc, pt_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _pt_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + rc = spdk_bdev_flush_blocks(pt_node->base_desc, pt_ch->base_ch, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + _pt_complete_io, bdev_io); + break; + case SPDK_BDEV_IO_TYPE_RESET: + rc = spdk_bdev_reset(pt_node->base_desc, pt_ch->base_ch, + _pt_complete_io, bdev_io); + break; + default: + SPDK_ERRLOG("passthru: unknown I/O type %d\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + if (rc != 0) { + if (rc == -ENOMEM) { + SPDK_ERRLOG("No memory, start to queue io for passthru.\n"); + io_ctx->ch = ch; + vbdev_passthru_queue_io(bdev_io); + } else { + SPDK_ERRLOG("ERROR on bdev_io submission!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +/* We'll just call the base bdev and let it answer however if we were more + * restrictive for some reason (or less) we could get the response back + * and modify according to our purposes. + */ +static bool +vbdev_passthru_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; + + return spdk_bdev_io_type_supported(pt_node->base_bdev, io_type); +} + +/* We supplied this as an entry point for upper layers who want to communicate to this + * bdev. This is how they get a channel. We are passed the same context we provided when + * we created our PT vbdev in examine() which, for this bdev, is the address of one of + * our context nodes. From here we'll ask the SPDK channel code to fill out our channel + * struct and we'll keep it in our PT node. + */ +static struct spdk_io_channel * +vbdev_passthru_get_io_channel(void *ctx) +{ + struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; + struct spdk_io_channel *pt_ch = NULL; + + /* The IO channel code will allocate a channel for us which consists of + * the SPDK channel structure plus the size of our pt_io_channel struct + * that we passed in when we registered our IO device. It will then call + * our channel create callback to populate any elements that we need to + * update. 
+ */ + pt_ch = spdk_get_io_channel(pt_node); + + return pt_ch; +} + +static int +vbdev_passthru_info_config_json(void *ctx, struct spdk_json_write_ctx *write_ctx) +{ + struct vbdev_passthru *pt_node = (struct vbdev_passthru *)ctx; + + /* This is the output for get_bdevs() for this vbdev */ + spdk_json_write_name(write_ctx, "passthru"); + spdk_json_write_object_begin(write_ctx); + + spdk_json_write_name(write_ctx, "pt_bdev_name"); + spdk_json_write_string(write_ctx, spdk_bdev_get_name(&pt_node->pt_bdev)); + + spdk_json_write_name(write_ctx, "base_bdev_name"); + spdk_json_write_string(write_ctx, spdk_bdev_get_name(pt_node->base_bdev)); + + spdk_json_write_object_end(write_ctx); + + return 0; +} + +/* We provide this callback for the SPDK channel code to create a channel using + * the channel struct we provided in our module get_io_channel() entry point. Here + * we get and save off an underlying base channel of the device below us so that + * we can communicate with the base bdev on a per channel basis. If we needed + * our own poller for this vbdev, we'd register it here. + */ +static int +pt_bdev_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct pt_io_channel *pt_ch = ctx_buf; + struct vbdev_passthru *pt_node = io_device; + + pt_ch->base_ch = spdk_bdev_get_io_channel(pt_node->base_desc); + + return 0; +} + +/* We provide this callback for the SPDK channel code to destroy a channel + * created with our create callback. We just need to undo anything we did + * when we created it. If this bdev used its own poller, we'd unregister it here. + */ +static void +pt_bdev_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct pt_io_channel *pt_ch = ctx_buf; + + spdk_put_io_channel(pt_ch->base_ch); +} + +/* Create the passthru association from the bdev and vbdev name and insert it + * on the global list. */ +static int +vbdev_passthru_insert_name(const char *bdev_name, const char *vbdev_name) +{ + struct bdev_names *name; + + name = calloc(1, sizeof(struct bdev_names)); + if (!name) { + SPDK_ERRLOG("could not allocate bdev_names\n"); + return -ENOMEM; + } + + name->bdev_name = strdup(bdev_name); + if (!name->bdev_name) { + SPDK_ERRLOG("could not allocate name->bdev_name\n"); + free(name); + return -ENOMEM; + } + + name->vbdev_name = strdup(vbdev_name); + if (!name->vbdev_name) { + SPDK_ERRLOG("could not allocate name->vbdev_name\n"); + free(name->bdev_name); + free(name); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_bdev_names, name, link); + + return 0; +} + +/* On init, just parse config file and build list of pt vbdevs and bdev name pairs.
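+ * The vbdevs themselves are created later, from examine() or the construct RPC, once the + * base bdevs they refer to actually exist.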
*/ +static int +vbdev_passthru_init(void) +{ + struct spdk_conf_section *sp = NULL; + const char *conf_bdev_name = NULL; + const char *conf_vbdev_name = NULL; + struct bdev_names *name; + int i, rc; + + sp = spdk_conf_find_section(NULL, "Passthru"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "PT", i)) { + break; + } + + conf_bdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 0); + if (!conf_bdev_name) { + SPDK_ERRLOG("Passthru configuration missing bdev name\n"); + break; + } + + conf_vbdev_name = spdk_conf_section_get_nmval(sp, "PT", i, 1); + if (!conf_vbdev_name) { + SPDK_ERRLOG("Passthru configuration missing pt_bdev name\n"); + break; + } + + rc = vbdev_passthru_insert_name(conf_bdev_name, conf_vbdev_name); + if (rc != 0) { + return rc; + } + } + TAILQ_FOREACH(name, &g_bdev_names, link) { + SPDK_NOTICELOG("conf parse matched: %s\n", name->bdev_name); + } + return 0; +} + +/* Called when the entire module is being torn down. */ +static void +vbdev_passthru_finish(void) +{ + struct bdev_names *name; + + while ((name = TAILQ_FIRST(&g_bdev_names))) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name); + } +} + +/* During init we'll be asked how much memory we'd like passed to us + * in bdev_io structures as context. Here's where we specify how + * much context we want per IO. + */ +static int +vbdev_passthru_get_ctx_size(void) +{ + return sizeof(struct passthru_bdev_io); +} + +/* Called when SPDK wants to save the current config of this vbdev module to + * a file. + */ +static void +vbdev_passthru_get_spdk_running_config(FILE *fp) +{ + struct bdev_names *names = NULL; + + fprintf(fp, "\n[Passthru]\n"); + TAILQ_FOREACH(names, &g_bdev_names, link) { + fprintf(fp, " PT %s %s\n", names->bdev_name, names->vbdev_name); + } + fprintf(fp, "\n"); +} + +/* Called when SPDK wants to output the bdev specific methods. */ +static void +vbdev_passthru_write_json_config(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct vbdev_passthru *pt_node = SPDK_CONTAINEROF(bdev, struct vbdev_passthru, pt_bdev); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_passthru_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev_name", spdk_bdev_get_name(pt_node->base_bdev)); + spdk_json_write_named_string(w, "passthru_bdev_name", spdk_bdev_get_name(bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +/* When we register our bdev this is how we specify our entry points. */ +static const struct spdk_bdev_fn_table vbdev_passthru_fn_table = { + .destruct = vbdev_passthru_destruct, + .submit_request = vbdev_passthru_submit_request, + .io_type_supported = vbdev_passthru_io_type_supported, + .get_io_channel = vbdev_passthru_get_io_channel, + .dump_info_json = vbdev_passthru_info_config_json, + .write_config_json = vbdev_passthru_write_json_config, +}; + +/* Called when the underlying base bdev goes away. */ +static void +vbdev_passthru_base_bdev_hotremove_cb(void *ctx) +{ + struct vbdev_passthru *pt_node, *tmp; + struct spdk_bdev *bdev_find = ctx; + + TAILQ_FOREACH_SAFE(pt_node, &g_pt_nodes, link, tmp) { + if (bdev_find == pt_node->base_bdev) { + spdk_bdev_unregister(&pt_node->pt_bdev, NULL, NULL); + } + } +} + +/* Create and register the passthru vbdev if we find it in our list of bdev names. + * This can be called either by the examine path or RPC method.
+ */ +static void +vbdev_passthru_register(struct spdk_bdev *bdev) +{ + struct bdev_names *name; + struct vbdev_passthru *pt_node; + int rc; + + /* Check our list of names from config versus this bdev and if + * there's a match, create the pt_node & bdev accordingly. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->bdev_name, bdev->name) != 0) { + continue; + } + + SPDK_NOTICELOG("Match on %s\n", bdev->name); + pt_node = calloc(1, sizeof(struct vbdev_passthru)); + if (!pt_node) { + SPDK_ERRLOG("could not allocate pt_node\n"); + break; + } + + /* The base bdev that we're attaching to. */ + pt_node->base_bdev = bdev; + pt_node->pt_bdev.name = strdup(name->vbdev_name); + if (!pt_node->pt_bdev.name) { + SPDK_ERRLOG("could not allocate pt_bdev name\n"); + free(pt_node); + break; + } + pt_node->pt_bdev.product_name = "passthru"; + + /* Copy some properties from the underlying base bdev. */ + pt_node->pt_bdev.write_cache = bdev->write_cache; + pt_node->pt_bdev.need_aligned_buffer = bdev->need_aligned_buffer; + pt_node->pt_bdev.optimal_io_boundary = bdev->optimal_io_boundary; + pt_node->pt_bdev.blocklen = bdev->blocklen; + pt_node->pt_bdev.blockcnt = bdev->blockcnt; + + /* This is the context that is passed to us when the bdev + * layer calls in so we'll save our pt_bdev node here. + */ + pt_node->pt_bdev.ctxt = pt_node; + pt_node->pt_bdev.fn_table = &vbdev_passthru_fn_table; + pt_node->pt_bdev.module = &passthru_if; + TAILQ_INSERT_TAIL(&g_pt_nodes, pt_node, link); + + spdk_io_device_register(pt_node, pt_bdev_ch_create_cb, pt_bdev_ch_destroy_cb, + sizeof(struct pt_io_channel), + name->bdev_name); + SPDK_NOTICELOG("io_device created at: 0x%p\n", pt_node); + + rc = spdk_bdev_open(bdev, true, vbdev_passthru_base_bdev_hotremove_cb, + bdev, &pt_node->base_desc); + if (rc) { + SPDK_ERRLOG("could not open bdev %s\n", spdk_bdev_get_name(bdev)); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("bdev opened\n"); + + rc = spdk_bdev_module_claim_bdev(bdev, pt_node->base_desc, pt_node->pt_bdev.module); + if (rc) { + SPDK_ERRLOG("could not claim bdev %s\n", spdk_bdev_get_name(bdev)); + spdk_bdev_close(pt_node->base_desc); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("bdev claimed\n"); + + rc = spdk_vbdev_register(&pt_node->pt_bdev, &bdev, 1); + if (rc) { + SPDK_ERRLOG("could not register pt_bdev\n"); + spdk_bdev_close(pt_node->base_desc); + TAILQ_REMOVE(&g_pt_nodes, pt_node, link); + free(pt_node->pt_bdev.name); + free(pt_node); + break; + } + SPDK_NOTICELOG("pt_bdev registered\n"); + SPDK_NOTICELOG("created pt_bdev for: %s\n", name->vbdev_name); + } +} + +/* Create the passthru disk from the given bdev and vbdev name. */ +int +create_passthru_disk(const char *bdev_name, const char *vbdev_name) +{ + struct spdk_bdev *bdev = NULL; + int rc = 0; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + return -1; + } + + rc = vbdev_passthru_insert_name(bdev_name, vbdev_name); + if (rc != 0) { + return rc; + } + + vbdev_passthru_register(bdev); + + return 0; +} + +void +delete_passthru_disk(struct spdk_bdev *bdev, spdk_delete_passthru_complete cb_fn, void *cb_arg) +{ + struct bdev_names *name; + + if (!bdev || bdev->module != &passthru_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + /* Remove the association (vbdev, bdev) from g_bdev_names. 
This is required so that the + * vbdev does not get re-created if the same bdev is constructed at some other time, + * unless the underlying bdev was hot-removed. + */ + TAILQ_FOREACH(name, &g_bdev_names, link) { + if (strcmp(name->vbdev_name, bdev->name) == 0) { + TAILQ_REMOVE(&g_bdev_names, name, link); + free(name->bdev_name); + free(name->vbdev_name); + free(name); + break; + } + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +/* Because we specified this function in our pt bdev function table when we + * registered our pt bdev, we'll get this call anytime a new bdev shows up. + * Here we need to decide if we care about it and if so what to do. We + * parsed the config file at init so we check the new bdev against the list + * we built up at that time and if the user configured us to attach to this + * bdev, here's where we do it. + */ +static void +vbdev_passthru_examine(struct spdk_bdev *bdev) +{ + vbdev_passthru_register(bdev); + + spdk_bdev_module_examine_done(&passthru_if); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_passthru", SPDK_LOG_VBDEV_PASSTHRU) diff --git a/src/spdk/lib/bdev/passthru/vbdev_passthru.h b/src/spdk/lib/bdev/passthru/vbdev_passthru.h new file mode 100644 index 00000000..5705c4ed --- /dev/null +++ b/src/spdk/lib/bdev/passthru/vbdev_passthru.h @@ -0,0 +1,62 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_VBDEV_PASSTHRU_H +#define SPDK_VBDEV_PASSTHRU_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_passthru_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new pass through bdev. + * + * \param bdev_name Bdev on which pass through vbdev will be created. + * \param vbdev_name Vbdev name. + * \return 0 on success, other on failure. + */ +int create_passthru_disk(const char *bdev_name, const char *vbdev_name); + +/** + * Delete passthru bdev. + * + * \param bdev Pointer to pass through bdev. + * \param cb_fn Function to call after deletion. 
+ * \param cb_arg Argument to pass to cb_fn. + */ +void delete_passthru_disk(struct spdk_bdev *bdev, spdk_delete_passthru_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_VBDEV_PASSTHRU_H */ diff --git a/src/spdk/lib/bdev/passthru/vbdev_passthru_rpc.c b/src/spdk/lib/bdev/passthru/vbdev_passthru_rpc.c new file mode 100644 index 00000000..9f0f9521 --- /dev/null +++ b/src/spdk/lib/bdev/passthru/vbdev_passthru_rpc.c @@ -0,0 +1,160 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vbdev_passthru.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +/* Structure to hold the parameters for this RPC method. */ +struct rpc_construct_passthru { + char *base_bdev_name; + char *passthru_bdev_name; +}; + +/* Free the allocated memory resource after the RPC handling. */ +static void +free_rpc_construct_passthru(struct rpc_construct_passthru *r) +{ + free(r->base_bdev_name); + free(r->passthru_bdev_name); +} + +/* Structure to decode the input parameters for this RPC method. */ +static const struct spdk_json_object_decoder rpc_construct_passthru_decoders[] = { + {"base_bdev_name", offsetof(struct rpc_construct_passthru, base_bdev_name), spdk_json_decode_string}, + {"passthru_bdev_name", offsetof(struct rpc_construct_passthru, passthru_bdev_name), spdk_json_decode_string}, +}; + +/* Decode the parameters for this RPC method and properly construct the passthru + * device. Error status returned in the failed cases. 
+ */ +static void +spdk_rpc_construct_passthru_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_passthru req = {NULL}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_passthru_decoders, + SPDK_COUNTOF(rpc_construct_passthru_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_PASSTHRU, "spdk_json_decode_object failed\n"); + goto invalid; + } + + rc = create_passthru_disk(req.base_bdev_name, req.passthru_bdev_name); + if (rc != 0) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_construct_passthru(&req); + return; + } + + spdk_json_write_string(w, req.passthru_bdev_name); + spdk_jsonrpc_end_result(request, w); + free_rpc_construct_passthru(&req); + return; + +invalid: + free_rpc_construct_passthru(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("construct_passthru_bdev", spdk_rpc_construct_passthru_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_passthru { + char *name; +}; + +static void +free_rpc_delete_passthru(struct rpc_delete_passthru *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_passthru_decoders[] = { + {"name", offsetof(struct rpc_delete_passthru, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_passthru_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_passthru_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_passthru req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_passthru_decoders, + SPDK_COUNTOF(rpc_delete_passthru_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + delete_passthru_disk(bdev, _spdk_rpc_delete_passthru_bdev_cb, request); + + free_rpc_delete_passthru(&req); + + return; + +invalid: + free_rpc_delete_passthru(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_passthru_bdev", spdk_rpc_delete_passthru_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/pmem/Makefile b/src/spdk/lib/bdev/pmem/Makefile new file mode 100644 index 00000000..19f0da8c --- /dev/null +++ b/src/spdk/lib/bdev/pmem/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_pmem.c bdev_pmem_rpc.c +LIBNAME = bdev_pmem + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/pmem/bdev_pmem.c b/src/spdk/lib/bdev/pmem/bdev_pmem.c new file mode 100644 index 00000000..9238e085 --- /dev/null +++ b/src/spdk/lib/bdev/pmem/bdev_pmem.c @@ -0,0 +1,465 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/conf.h" +#include "spdk/string.h" +#include "spdk/likely.h" +#include "spdk/util.h" +#include "spdk/rpc.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#include "bdev_pmem.h" +#include "libpmemblk.h" + +struct pmem_disk { + struct spdk_bdev disk; + PMEMblkpool *pool; + char pmem_file[NAME_MAX]; + TAILQ_ENTRY(pmem_disk) tailq; +}; + +static TAILQ_HEAD(, pmem_disk) g_pmem_disks = TAILQ_HEAD_INITIALIZER(g_pmem_disks); + +static int bdev_pmem_initialize(void); +static void bdev_pmem_finish(void); + +static struct spdk_bdev_module pmem_if = { + .name = "pmem", + .module_init = bdev_pmem_initialize, + .module_fini = bdev_pmem_finish, + .async_fini = true, + +}; + +SPDK_BDEV_MODULE_REGISTER(&pmem_if) + +typedef int(*spdk_bdev_pmem_io_request)(PMEMblkpool *pbp, void *buf, long long blockno); + +static int +_bdev_pmem_submit_io_read(PMEMblkpool *pbp, void *buf, long long blockno) +{ + return pmemblk_read(pbp, buf, blockno); +} + +static int +_bdev_pmem_submit_io_write(PMEMblkpool *pbp, void *buf, long long blockno) +{ + return pmemblk_write(pbp, buf, blockno); +} + +static int +bdev_pmem_destruct(void *ctx) +{ + struct pmem_disk *pdisk = ctx; + + TAILQ_REMOVE(&g_pmem_disks, pdisk, tailq); + free(pdisk->disk.name); + pmemblk_close(pdisk->pool); + free(pdisk); + + return 0; +} + +static int +bdev_pmem_check_iov_len(struct iovec *iovs, int iovcnt, size_t num_blocks, uint32_t block_size) +{ + size_t nbytes = num_blocks * block_size; + int i; + + for (i = 0; i < iovcnt; i++) { + if (spdk_unlikely(iovs[i].iov_base == NULL && iovs[i].iov_len != 0)) { + return -1; + } + + if (nbytes <= iovs[i].iov_len) { + return 0; + } + + if (spdk_unlikely(iovs[i].iov_len % block_size != 0)) { + return -1; + } + + nbytes -= iovs[i].iov_len; + } + + return -1; +} + +static void +bdev_pmem_submit_io(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, + struct spdk_io_channel *ch, + struct iovec *iov, int iovcnt, + uint64_t offset_blocks, size_t num_blocks, uint32_t block_size, + spdk_bdev_pmem_io_request fn) +{ + int rc; + size_t nbytes, offset, len; + enum spdk_bdev_io_status status; + + rc = bdev_pmem_check_iov_len(iov, iovcnt, num_blocks, block_size); + if (rc) { + status = SPDK_BDEV_IO_STATUS_FAILED; + goto end; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "io %lu bytes from offset %#lx\n", + num_blocks, offset_blocks); + + for (nbytes = num_blocks * block_size; nbytes > 0; iov++) { + len = spdk_min(iov->iov_len, nbytes); + nbytes -= len; + + offset = 0; + while (offset != len) { + rc = fn(pdisk->pool, iov->iov_base + offset, offset_blocks); + if (rc != 0) { + SPDK_ERRLOG("pmemblk io failed: %d (%s)\n", errno, pmemblk_errormsg()); + status = SPDK_BDEV_IO_STATUS_FAILED; + goto end; + } + + offset += block_size; + offset_blocks++; + } + } + + assert(num_blocks == offset_blocks - bdev_io->u.bdev.offset_blocks); + status = SPDK_BDEV_IO_STATUS_SUCCESS; +end: + + spdk_bdev_io_complete(bdev_io, status); +} + +static void +bdev_pmem_write_zeros(struct spdk_bdev_io *bdev_io, struct pmem_disk *pdisk, + struct spdk_io_channel *ch, uint64_t offset_blocks, + uint64_t num_blocks, uint32_t block_size) +{ + int rc; + enum spdk_bdev_io_status status = SPDK_BDEV_IO_STATUS_SUCCESS; + + while (num_blocks > 0) { + rc = pmemblk_set_zero(pdisk->pool, offset_blocks); + if (rc != 0) { + SPDK_ERRLOG("pmemblk_set_zero failed: %d (%s)\n", errno, pmemblk_errormsg()); + status = SPDK_BDEV_IO_STATUS_FAILED; + break; + } + offset_blocks++; + num_blocks--; + } + spdk_bdev_io_complete(bdev_io, status); +} + 
+static void +bdev_pmem_io_get_buf_cb(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io) +{ + bdev_pmem_submit_io(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen, + _bdev_pmem_submit_io_read); +} + +static void +bdev_pmem_submit_request(struct spdk_io_channel *channel, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_pmem_io_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_pmem_submit_io(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen, + _bdev_pmem_submit_io_write); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + bdev_pmem_write_zeros(bdev_io, + bdev_io->bdev->ctxt, + channel, + bdev_io->u.bdev.offset_blocks, + bdev_io->u.bdev.num_blocks, + bdev_io->bdev->blocklen); + break; + case SPDK_BDEV_IO_TYPE_RESET: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + break; + default: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_pmem_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_RESET: + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + return true; + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_pmem_get_io_channel(void *ctx) +{ + return spdk_get_io_channel(&g_pmem_disks); +} + +static int +bdev_pmem_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct pmem_disk *pdisk = ctx; + + spdk_json_write_name(w, "pmem"); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "pmem_file"); + spdk_json_write_string(w, pdisk->pmem_file); + spdk_json_write_object_end(w); + + return 0; +} + +static int +bdev_pmem_create_cb(void *io_device, void *ctx_buf) +{ + return 0; +} + +static void +bdev_pmem_destroy_cb(void *io_device, void *ctx_buf) +{ +} + +static void +bdev_pmem_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct pmem_disk *disk = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_pmem_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "pmem_file", disk->pmem_file); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table pmem_fn_table = { + .destruct = bdev_pmem_destruct, + .submit_request = bdev_pmem_submit_request, + .io_type_supported = bdev_pmem_io_type_supported, + .get_io_channel = bdev_pmem_get_io_channel, + .dump_info_json = bdev_pmem_dump_info_json, + .write_config_json = bdev_pmem_write_config_json, +}; + +int +spdk_create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev) +{ + uint64_t num_blocks; + uint32_t block_size; + struct pmem_disk *pdisk; + int rc; + + *bdev = NULL; + + if (name == NULL) { + SPDK_ERRLOG("Missing name parameter for spdk_create_pmem_disk()\n"); + return EINVAL; + } + + if (pmemblk_check(pmem_file, 0) != 1) { + SPDK_ERRLOG("Pool '%s' check failed: %s\n", pmem_file, pmemblk_errormsg()); + return EIO; + } + 
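+ /* Pool file passed pmemblk_check(); allocate the disk context, open the pool and read its block size and capacity. */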
+ pdisk = calloc(1, sizeof(*pdisk)); + if (!pdisk) { + return ENOMEM; + } + + snprintf(pdisk->pmem_file, sizeof(pdisk->pmem_file), "%s", pmem_file); + pdisk->pool = pmemblk_open(pmem_file, 0); + if (!pdisk->pool) { + SPDK_ERRLOG("Opening pmem pool '%s' failed: %d\n", pmem_file, errno); + free(pdisk); + return errno; + } + + block_size = pmemblk_bsize(pdisk->pool); + num_blocks = pmemblk_nblock(pdisk->pool); + + if (block_size == 0) { + SPDK_ERRLOG("Block size must be more than 0 bytes\n"); + pmemblk_close(pdisk->pool); + free(pdisk); + return EINVAL; + } + + if (num_blocks == 0) { + SPDK_ERRLOG("Disk must be more than 0 blocks\n"); + pmemblk_close(pdisk->pool); + free(pdisk); + return EINVAL; + } + + pdisk->disk.name = strdup(name); + if (!pdisk->disk.name) { + pmemblk_close(pdisk->pool); + free(pdisk); + return ENOMEM; + } + + pdisk->disk.product_name = "pmemblk disk"; + pdisk->disk.write_cache = 0; + pdisk->disk.blocklen = block_size; + pdisk->disk.blockcnt = num_blocks; + + pdisk->disk.ctxt = pdisk; + pdisk->disk.fn_table = &pmem_fn_table; + pdisk->disk.module = &pmem_if; + + rc = spdk_bdev_register(&pdisk->disk); + if (rc) { + pmemblk_close(pdisk->pool); + free(pdisk->disk.name); + free(pdisk); + return rc; + } + + TAILQ_INSERT_TAIL(&g_pmem_disks, pdisk, tailq); + + *bdev = &pdisk->disk; + + return 0; +} + +void +spdk_delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &pmem_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static void +bdev_pmem_read_conf(void) +{ + struct spdk_conf_section *sp; + struct spdk_bdev *bdev; + const char *pmem_file; + const char *bdev_name; + int i; + + sp = spdk_conf_find_section(NULL, "Pmem"); + if (sp == NULL) { + return; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "Blk", i)) { + break; + } + + pmem_file = spdk_conf_section_get_nmval(sp, "Blk", i, 0); + if (pmem_file == NULL) { + SPDK_ERRLOG("Pmem: missing filename\n"); + continue; + } + + bdev_name = spdk_conf_section_get_nmval(sp, "Blk", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("Pmem: missing bdev name\n"); + continue; + } + + spdk_create_pmem_disk(pmem_file, bdev_name, &bdev); + } +} + +static int +bdev_pmem_initialize(void) +{ + const char *err = pmemblk_check_version(PMEMBLK_MAJOR_VERSION, PMEMBLK_MINOR_VERSION); + + if (err != NULL) { + SPDK_ERRLOG("Invalid libpmemblk version (expected %d.%d): %s\n", PMEMBLK_MAJOR_VERSION, + PMEMBLK_MINOR_VERSION, err); + return -1; + } + + spdk_io_device_register(&g_pmem_disks, bdev_pmem_create_cb, bdev_pmem_destroy_cb, 0, "pmem_bdev"); + + bdev_pmem_read_conf(); + + return 0; + +} + +static void +bdev_pmem_finish_done(void *io_device) +{ + spdk_bdev_module_finish_done(); +} + +static void +bdev_pmem_finish(void) +{ + spdk_io_device_unregister(&g_pmem_disks, bdev_pmem_finish_done); +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_pmem", SPDK_LOG_BDEV_PMEM) diff --git a/src/spdk/lib/bdev/pmem/bdev_pmem.h b/src/spdk/lib/bdev/pmem/bdev_pmem.h new file mode 100644 index 00000000..7814166c --- /dev/null +++ b/src/spdk/lib/bdev/pmem/bdev_pmem.h @@ -0,0 +1,64 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_PMEM_H +#define SPDK_BDEV_PMEM_H + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_pmem_complete)(void *cb_arg, int bdeverrno); + +/** + * Create new pmem bdev. + * + * \param pmem_file Pointer to pmem pool file. + * \param name Bdev name. + * \param bdev output parameter for bdev when operation is successful. + * \return 0 on success. + * -EIO if pool check failed + * -EINVAL if input parameters check failed + * -ENOMEM if buffer cannot be allocated + */ +int spdk_create_pmem_disk(const char *pmem_file, const char *name, struct spdk_bdev **bdev); + +/** + * Delete pmem bdev. + * + * \param bdev Pointer to pmem bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void spdk_delete_pmem_disk(struct spdk_bdev *bdev, spdk_delete_pmem_complete cb_fn, + void *cb_arg); + +#endif /* SPDK_BDEV_PMEM_H */ diff --git a/src/spdk/lib/bdev/pmem/bdev_pmem_rpc.c b/src/spdk/lib/bdev/pmem/bdev_pmem_rpc.c new file mode 100644 index 00000000..3156cffb --- /dev/null +++ b/src/spdk/lib/bdev/pmem/bdev_pmem_rpc.c @@ -0,0 +1,350 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_pmem.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "libpmemblk.h" + +#include "spdk_internal/log.h" + +struct rpc_construct_pmem { + char *pmem_file; + char *name; +}; + +static void +free_rpc_construct_pmem_bdev(struct rpc_construct_pmem *req) +{ + free(req->pmem_file); + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_construct_pmem_decoders[] = { + {"pmem_file", offsetof(struct rpc_construct_pmem, pmem_file), spdk_json_decode_string}, + {"name", offsetof(struct rpc_construct_pmem, name), spdk_json_decode_string}, +}; + +static void +spdk_rpc_construct_pmem_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_pmem req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_pmem_decoders, + SPDK_COUNTOF(rpc_construct_pmem_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + rc = EINVAL; + goto invalid; + } + rc = spdk_create_pmem_disk(req.pmem_file, req.name, &bdev); + if (rc != 0) { + goto invalid; + } + if (bdev == NULL) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_construct_pmem_bdev(&req); + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + + free_rpc_construct_pmem_bdev(&req); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(rc)); + free_rpc_construct_pmem_bdev(&req); +} +SPDK_RPC_REGISTER("construct_pmem_bdev", spdk_rpc_construct_pmem_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_pmem { + char *name; +}; + +static void +free_rpc_delete_pmem(struct rpc_delete_pmem *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_pmem_decoders[] = { + {"name", offsetof(struct rpc_delete_pmem, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_pmem_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_pmem_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_pmem req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_pmem_decoders, + SPDK_COUNTOF(rpc_delete_pmem_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev 
== NULL) { + rc = -ENODEV; + goto invalid; + } + + spdk_delete_pmem_disk(bdev, _spdk_rpc_delete_pmem_bdev_cb, request); + free_rpc_delete_pmem(&req); + return; + +invalid: + free_rpc_delete_pmem(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_pmem_bdev", spdk_rpc_delete_pmem_bdev, SPDK_RPC_RUNTIME) + +struct rpc_create_pmem_pool { + char *pmem_file; + uint64_t num_blocks; + uint32_t block_size; +}; + +static const struct spdk_json_object_decoder rpc_create_pmem_pool_decoders[] = { + {"pmem_file", offsetof(struct rpc_create_pmem_pool, pmem_file), spdk_json_decode_string}, + {"num_blocks", offsetof(struct rpc_create_pmem_pool, num_blocks), spdk_json_decode_uint64}, + {"block_size", offsetof(struct rpc_create_pmem_pool, block_size), spdk_json_decode_uint32}, +}; + +static void +free_rpc_create_pmem_pool(struct rpc_create_pmem_pool *req) +{ + free(req->pmem_file); +} + +static void +spdk_rpc_create_pmem_pool(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_create_pmem_pool req = {}; + struct spdk_json_write_ctx *w; + uint64_t pool_size; + PMEMblkpool *pbp; + + if (spdk_json_decode_object(params, rpc_create_pmem_pool_decoders, + SPDK_COUNTOF(rpc_create_pmem_pool_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + goto invalid; + } + + /* libpmemblk pool has to contain at least 256 blocks */ + if (req.num_blocks < 256) { + goto invalid; + } + + pool_size = req.num_blocks * req.block_size; + if (pool_size < PMEMBLK_MIN_POOL) { + goto invalid; + } + + pbp = pmemblk_create(req.pmem_file, req.block_size, pool_size, 0666); + if (pbp == NULL) { + goto invalid; + } + + pmemblk_close(pbp); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_create_pmem_pool(&req); + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + free_rpc_create_pmem_pool(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_create_pmem_pool(&req); +} +SPDK_RPC_REGISTER("create_pmem_pool", spdk_rpc_create_pmem_pool, SPDK_RPC_RUNTIME) + +struct rpc_pmem_pool_info { + char *pmem_file; +}; + +static const struct spdk_json_object_decoder rpc_pmem_pool_info_decoders[] = { + {"pmem_file", offsetof(struct rpc_pmem_pool_info, pmem_file), spdk_json_decode_string}, +}; + +static void +free_rpc_pmem_pool_info(struct rpc_pmem_pool_info *req) +{ + free(req->pmem_file); +} + +static void +spdk_rpc_pmem_pool_info(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_pmem_pool_info req = {}; + struct spdk_json_write_ctx *w; + size_t num_blocks, block_size; + PMEMblkpool *pbp; + + if (spdk_json_decode_object(params, rpc_pmem_pool_info_decoders, + SPDK_COUNTOF(rpc_pmem_pool_info_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + goto invalid; + } + + pbp = pmemblk_open(req.pmem_file, 0); + if (pbp == NULL) { + goto invalid; + } + + block_size = pmemblk_bsize(pbp); + num_blocks = pmemblk_nblock(pbp); + + + pmemblk_close(pbp); + + /* Check pmem pool consistency */ + if (pmemblk_check(req.pmem_file, block_size) != 1) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_pmem_pool_info(&req); + return; + } + + spdk_json_write_array_begin(w); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, 
"num_blocks"); + spdk_json_write_uint64(w, num_blocks); + spdk_json_write_name(w, "block_size"); + spdk_json_write_uint64(w, block_size); + spdk_json_write_object_end(w); + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free_rpc_pmem_pool_info(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_pmem_pool_info(&req); +} +SPDK_RPC_REGISTER("pmem_pool_info", spdk_rpc_pmem_pool_info, SPDK_RPC_RUNTIME) + +struct rpc_delete_pmem_pool { + char *pmem_file; +}; + +static const struct spdk_json_object_decoder rpc_delete_pmem_pool_decoders[] = { + {"pmem_file", offsetof(struct rpc_delete_pmem_pool, pmem_file), spdk_json_decode_string}, +}; + +static void +free_rpc_delete_pmem_pool(struct rpc_delete_pmem_pool *req) +{ + free(req->pmem_file); +} + +static void +spdk_rpc_delete_pmem_pool(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_pmem_pool req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_delete_pmem_pool_decoders, + SPDK_COUNTOF(rpc_delete_pmem_pool_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_PMEM, "spdk_json_decode_object failed\n"); + goto invalid; + } + + /* Check if file is actually pmem pool */ + if (pmemblk_check(req.pmem_file, 0) != 1) { + goto invalid; + } + + unlink(req.pmem_file); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_delete_pmem_pool(&req); + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + free_rpc_delete_pmem_pool(&req); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_delete_pmem_pool(&req); +} +SPDK_RPC_REGISTER("delete_pmem_pool", spdk_rpc_delete_pmem_pool, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/raid/Makefile b/src/spdk/lib/bdev/raid/Makefile new file mode 100644 index 00000000..8332399d --- /dev/null +++ b/src/spdk/lib/bdev/raid/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib/bdev/ +C_SRCS = bdev_raid.c bdev_raid_rpc.c +LIBNAME = vbdev_raid + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/raid/bdev_raid.c b/src/spdk/lib/bdev/raid/bdev_raid.c new file mode 100644 index 00000000..51fa94ec --- /dev/null +++ b/src/spdk/lib/bdev/raid/bdev_raid.c @@ -0,0 +1,1624 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "bdev_raid.h" +#include "spdk/env.h" +#include "spdk/io_channel.h" +#include "spdk/conf.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" +#include "spdk/string.h" + +static bool g_shutdown_started = false; + +/* raid bdev config as read from config file */ +struct raid_config g_spdk_raid_config = { + .raid_bdev_config_head = TAILQ_HEAD_INITIALIZER(g_spdk_raid_config.raid_bdev_config_head), +}; + +/* + * List of raid bdev in configured list, these raid bdevs are registered with + * bdev layer + */ +struct spdk_raid_configured_tailq g_spdk_raid_bdev_configured_list; + +/* List of raid bdev in configuring list */ +struct spdk_raid_configuring_tailq g_spdk_raid_bdev_configuring_list; + +/* List of all raid bdevs */ +struct spdk_raid_all_tailq g_spdk_raid_bdev_list; + +/* List of all raid bdevs that are offline */ +struct spdk_raid_offline_tailq g_spdk_raid_bdev_offline_list; + +/* Function declarations */ +static void raid_bdev_examine(struct spdk_bdev *bdev); +static int raid_bdev_init(void); +static void raid_bdev_waitq_io_process(void *ctx); +static void raid_bdev_deconfigure(struct raid_bdev *raid_bdev); + + +/* + * brief: + * raid_bdev_create_cb function is a cb function for raid bdev which creates the + * hierarchy from raid bdev to base bdev io channels. It will be called per core + * params: + * io_device - pointer to raid bdev io device represented by raid_bdev + * ctx_buf - pointer to context buffer for raid bdev io channel + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_create_cb(void *io_device, void *ctx_buf) +{ + struct raid_bdev *raid_bdev = io_device; + struct raid_bdev_io_channel *raid_ch = ctx_buf; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_create_cb, %p\n", raid_ch); + + assert(raid_bdev != NULL); + assert(raid_bdev->state == RAID_BDEV_STATE_ONLINE); + + raid_ch->base_channel = calloc(raid_bdev->num_base_bdevs, + sizeof(struct spdk_io_channel *)); + if (!raid_ch->base_channel) { + SPDK_ERRLOG("Unable to allocate base bdevs io channel\n"); + return -ENOMEM; + } + for (uint32_t i = 0; i < raid_bdev->num_base_bdevs; i++) { + /* + * Get the spdk_io_channel for all the base bdevs. This is used during + * split logic to send the respective child bdev ios to respective base + * bdev io channel. + */ + raid_ch->base_channel[i] = spdk_bdev_get_io_channel( + raid_bdev->base_bdev_info[i].desc); + if (!raid_ch->base_channel[i]) { + for (uint32_t j = 0; j < i; j++) { + spdk_put_io_channel(raid_ch->base_channel[j]); + } + free(raid_ch->base_channel); + SPDK_ERRLOG("Unable to create io channel for base bdev\n"); + return -ENOMEM; + } + } + + return 0; +} + +/* + * brief: + * raid_bdev_destroy_cb function is a cb function for raid bdev which deletes the + * hierarchy from raid bdev to base bdev io channels. 
It will be called per core + * params: + * io_device - pointer to raid bdev io device represented by raid_bdev + * ctx_buf - pointer to context buffer for raid bdev io channel + * returns: + * none + */ +static void +raid_bdev_destroy_cb(void *io_device, void *ctx_buf) +{ + struct raid_bdev_io_channel *raid_ch = ctx_buf; + struct raid_bdev *raid_bdev = io_device; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destroy_cb\n"); + + assert(raid_bdev != NULL); + assert(raid_ch != NULL); + assert(raid_ch->base_channel); + for (uint32_t i = 0; i < raid_bdev->num_base_bdevs; i++) { + /* Free base bdev channels */ + assert(raid_ch->base_channel[i] != NULL); + spdk_put_io_channel(raid_ch->base_channel[i]); + raid_ch->base_channel[i] = NULL; + } + free(raid_ch->base_channel); + raid_ch->base_channel = NULL; +} + +/* + * brief: + * raid_bdev_cleanup is used to cleanup and free raid_bdev related data + * structures. + * params: + * raid_bdev - pointer to raid_bdev + * returns: + * none + */ +void +raid_bdev_cleanup(struct raid_bdev *raid_bdev) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_cleanup, %p name %s, state %u, config %p\n", + raid_bdev, + raid_bdev->bdev.name, raid_bdev->state, raid_bdev->config); + if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) { + TAILQ_REMOVE(&g_spdk_raid_bdev_configuring_list, raid_bdev, state_link); + } else if (raid_bdev->state == RAID_BDEV_STATE_OFFLINE) { + TAILQ_REMOVE(&g_spdk_raid_bdev_offline_list, raid_bdev, state_link); + } else { + assert(0); + } + TAILQ_REMOVE(&g_spdk_raid_bdev_list, raid_bdev, global_link); + free(raid_bdev->bdev.name); + raid_bdev->bdev.name = NULL; + assert(raid_bdev->base_bdev_info); + free(raid_bdev->base_bdev_info); + raid_bdev->base_bdev_info = NULL; + if (raid_bdev->config) { + raid_bdev->config->raid_bdev = NULL; + } + free(raid_bdev); +} + +/* + * brief: + * free resource of base bdev for raid bdev + * params: + * raid_bdev - pointer to raid bdev + * base_bdev_slot - position to base bdev in raid bdev + * returns: + * 0 - success + * non zero - failure + */ +void +raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint32_t base_bdev_slot) +{ + struct raid_base_bdev_info *info; + + info = &raid_bdev->base_bdev_info[base_bdev_slot]; + + spdk_bdev_module_release_bdev(info->bdev); + spdk_bdev_close(info->desc); + info->desc = NULL; + info->bdev = NULL; + + assert(raid_bdev->num_base_bdevs_discovered); + raid_bdev->num_base_bdevs_discovered--; +} + +/* + * brief: + * raid_bdev_destruct is the destruct function table pointer for raid bdev + * params: + * ctxt - pointer to raid_bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_destruct(void *ctxt) +{ + struct raid_bdev *raid_bdev = ctxt; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_destruct\n"); + + raid_bdev->destruct_called = true; + for (uint16_t i = 0; i < raid_bdev->num_base_bdevs; i++) { + /* + * Close all base bdev descriptors for which call has come from below + * layers. Also close the descriptors if we have started shutdown. 
+ */ + if (g_shutdown_started || + ((raid_bdev->base_bdev_info[i].remove_scheduled == true) && + (raid_bdev->base_bdev_info[i].bdev != NULL))) { + raid_bdev_free_base_bdev_resource(raid_bdev, i); + } + } + + if (g_shutdown_started) { + TAILQ_REMOVE(&g_spdk_raid_bdev_configured_list, raid_bdev, state_link); + raid_bdev->state = RAID_BDEV_STATE_OFFLINE; + TAILQ_INSERT_TAIL(&g_spdk_raid_bdev_offline_list, raid_bdev, state_link); + spdk_io_device_unregister(raid_bdev, NULL); + } + + if (raid_bdev->num_base_bdevs_discovered == 0) { + /* Free raid_bdev when there are no base bdevs left */ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev base bdevs is 0, going to free all in destruct\n"); + raid_bdev_cleanup(raid_bdev); + } + + return 0; +} + +/* + * brief: + * raid_bdev_io_completion function is called by lower layers to notify raid + * module that particular bdev_io is completed. + * params: + * bdev_io - pointer to bdev io submitted to lower layers, like child io + * success - bdev_io status + * cb_arg - function callback context, like parent io pointer + * returns: + * none + */ +static void +raid_bdev_io_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + + spdk_bdev_free_io(bdev_io); + + if (success) { + spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(parent_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +/* + * brief: + * raid_bdev_submit_rw_request function is used to submit I/O to the correct + * member disk + * params: + * bdev_io - parent bdev io + * start_strip - start strip number of this io + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_submit_rw_request(struct spdk_bdev_io *bdev_io, uint64_t start_strip) +{ + struct raid_bdev_io *raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + struct raid_bdev_io_channel *raid_ch = spdk_io_channel_get_ctx(raid_io->ch); + struct raid_bdev *raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt; + uint64_t pd_strip; + uint32_t offset_in_strip; + uint64_t pd_lba; + uint64_t pd_blocks; + uint32_t pd_idx; + int ret = 0; + + pd_strip = start_strip / raid_bdev->num_base_bdevs; + pd_idx = start_strip % raid_bdev->num_base_bdevs; + offset_in_strip = bdev_io->u.bdev.offset_blocks & (raid_bdev->strip_size - 1); + pd_lba = (pd_strip << raid_bdev->strip_size_shift) + offset_in_strip; + pd_blocks = bdev_io->u.bdev.num_blocks; + if (raid_bdev->base_bdev_info[pd_idx].desc == NULL) { + SPDK_ERRLOG("base bdev desc null for pd_idx %u\n", pd_idx); + assert(0); + } + + /* + * Submit child io to bdev layer with using base bdev descriptors, base + * bdev lba, base bdev child io length in blocks, buffer, completion + * function and function callback context + */ + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + ret = spdk_bdev_readv_blocks(raid_bdev->base_bdev_info[pd_idx].desc, + raid_ch->base_channel[pd_idx], + bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + pd_lba, pd_blocks, raid_bdev_io_completion, + bdev_io); + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + ret = spdk_bdev_writev_blocks(raid_bdev->base_bdev_info[pd_idx].desc, + raid_ch->base_channel[pd_idx], + bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + pd_lba, pd_blocks, raid_bdev_io_completion, + bdev_io); + } else { + SPDK_ERRLOG("Recvd not supported io type %u\n", bdev_io->type); + assert(0); + } + + return ret; +} + +/* + * brief: + * get_curr_base_bdev_index function calculates the base bdev index + * params: + * raid_bdev - pointer to pooled bdev + * 
raid_io - pointer to parent io context + * returns: + * base bdev index + */ +static uint8_t +get_curr_base_bdev_index(struct raid_bdev *raid_bdev, struct raid_bdev_io *raid_io) +{ + struct spdk_bdev_io *bdev_io; + uint64_t start_strip; + + bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx); + start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift; + + return (start_strip % raid_bdev->num_base_bdevs); +} + +/* + * brief: + * raid_bdev_io_submit_fail_process function processes the IO which failed to submit. + * It will try to queue the IOs after storing the context to bdev wait queue logic. + * params: + * bdev_io - pointer to bdev_io + * raid_io - pointer to raid bdev io + * ret - return code + * returns: + * none + */ +static void +raid_bdev_io_submit_fail_process(struct raid_bdev *raid_bdev, struct spdk_bdev_io *bdev_io, + struct raid_bdev_io *raid_io, int ret) +{ + struct raid_bdev_io_channel *raid_ch; + uint8_t pd_idx; + + if (ret != -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else { + /* Queue the IO to bdev layer wait queue */ + pd_idx = get_curr_base_bdev_index(raid_bdev, raid_io); + raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[pd_idx].bdev; + raid_io->waitq_entry.cb_fn = raid_bdev_waitq_io_process; + raid_io->waitq_entry.cb_arg = raid_io; + raid_ch = spdk_io_channel_get_ctx(raid_io->ch); + if (spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[pd_idx].bdev, + raid_ch->base_channel[pd_idx], + &raid_io->waitq_entry) != 0) { + SPDK_ERRLOG("bdev io waitq error, it should not happen\n"); + assert(0); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +/* + * brief: + * raid_bdev_waitq_io_process function is the callback function + * registered by raid bdev module to bdev when bdev_io was unavailable. + * params: + * ctx - pointer to raid_bdev_io + * returns: + * none + */ +static void +raid_bdev_waitq_io_process(void *ctx) +{ + struct raid_bdev_io *raid_io = ctx; + struct spdk_bdev_io *bdev_io; + struct raid_bdev *raid_bdev; + int ret; + uint64_t start_strip; + + bdev_io = SPDK_CONTAINEROF(raid_io, struct spdk_bdev_io, driver_ctx); + /* + * Try to submit childs of parent bdev io. If failed due to resource + * crunch then break the loop and don't try to process other queued IOs. 
+ */ + raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt; + start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift; + ret = raid_bdev_submit_rw_request(bdev_io, start_strip); + if (ret != 0) { + raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret); + } +} + +/* + * brief: + * raid_bdev_start_rw_request function is the submit_request function for + * read/write requests + * params: + * ch - pointer to raid bdev io channel + * bdev_io - pointer to parent bdev_io on raid bdev device + * returns: + * none + */ +static void +raid_bdev_start_rw_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct raid_bdev_io *raid_io; + struct raid_bdev *raid_bdev; + uint64_t start_strip = 0; + uint64_t end_strip = 0; + int ret; + + raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt; + raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + raid_io->ch = ch; + start_strip = bdev_io->u.bdev.offset_blocks >> raid_bdev->strip_size_shift; + end_strip = (bdev_io->u.bdev.offset_blocks + bdev_io->u.bdev.num_blocks - 1) >> + raid_bdev->strip_size_shift; + if (start_strip != end_strip && raid_bdev->num_base_bdevs > 1) { + assert(false); + SPDK_ERRLOG("I/O spans strip boundary!\n"); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + ret = raid_bdev_submit_rw_request(bdev_io, start_strip); + if (ret != 0) { + raid_bdev_io_submit_fail_process(raid_bdev, bdev_io, raid_io, ret); + } +} + +/* + * brief: + * raid_bdev_reset_completion is the completion callback for member disk resets + * params: + * bdev_io - pointer to member disk reset bdev_io + * success - true if reset was successful, false if unsuccessful + * cb_arg - callback argument (parent reset bdev_io) + * returns: + * none + */ +static void +raid_bdev_reset_completion(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_bdev_io *parent_io = cb_arg; + struct raid_bdev *raid_bdev = (struct raid_bdev *)parent_io->bdev->ctxt; + struct raid_bdev_io *raid_io = (struct raid_bdev_io *)parent_io->driver_ctx; + + spdk_bdev_free_io(bdev_io); + + if (!success) { + raid_io->base_bdev_reset_status = SPDK_BDEV_IO_STATUS_FAILED; + } + + raid_io->base_bdev_reset_completed++; + if (raid_io->base_bdev_reset_completed == raid_bdev->num_base_bdevs) { + spdk_bdev_io_complete(parent_io, raid_io->base_bdev_reset_status); + } +} + +/* + * brief: + * _raid_bdev_submit_reset_request_next function submits the next batch of reset requests + * to member disks; it will submit as many as possible unless a reset fails with -ENOMEM, in + * which case it will queue it for later submission + * params: + * bdev_io - pointer to parent bdev_io on raid bdev device + * returns: + * none + */ +static void +_raid_bdev_submit_reset_request_next(void *_bdev_io) +{ + struct spdk_bdev_io *bdev_io = _bdev_io; + struct raid_bdev_io *raid_io; + struct raid_bdev *raid_bdev; + struct raid_bdev_io_channel *raid_ch; + int ret; + uint8_t i; + + raid_bdev = (struct raid_bdev *)bdev_io->bdev->ctxt; + raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + raid_ch = spdk_io_channel_get_ctx(raid_io->ch); + + while (raid_io->base_bdev_reset_submitted < raid_bdev->num_base_bdevs) { + i = raid_io->base_bdev_reset_submitted; + ret = spdk_bdev_reset(raid_bdev->base_bdev_info[i].desc, + raid_ch->base_channel[i], + raid_bdev_reset_completion, bdev_io); + if (ret == 0) { + raid_io->base_bdev_reset_submitted++; + } else if (ret == -ENOMEM) { + raid_io->waitq_entry.bdev = raid_bdev->base_bdev_info[i].bdev; + 
raid_io->waitq_entry.cb_fn = _raid_bdev_submit_reset_request_next; + raid_io->waitq_entry.cb_arg = bdev_io; + spdk_bdev_queue_io_wait(raid_bdev->base_bdev_info[i].bdev, + raid_ch->base_channel[i], + &raid_io->waitq_entry); + return; + } else { + assert(false); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + } +} + +/* + * brief: + * _raid_bdev_submit_reset_request function is the submit_request function for + * reset requests + * params: + * ch - pointer to raid bdev io channel + * bdev_io - pointer to parent bdev_io on raid bdev device + * returns: + * none + */ +static void +_raid_bdev_submit_reset_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct raid_bdev_io *raid_io; + + raid_io = (struct raid_bdev_io *)bdev_io->driver_ctx; + raid_io->ch = ch; + raid_io->base_bdev_reset_submitted = 0; + raid_io->base_bdev_reset_completed = 0; + raid_io->base_bdev_reset_status = SPDK_BDEV_IO_STATUS_SUCCESS; + _raid_bdev_submit_reset_request_next(bdev_io); +} + +/* + * brief: + * raid_bdev_submit_request function is the submit_request function pointer of + * raid bdev function table. This is used to submit the io on raid_bdev to below + * layers. + * params: + * ch - pointer to raid bdev io channel + * bdev_io - pointer to parent bdev_io on raid bdev device + * returns: + * none + */ +static void +raid_bdev_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + if (bdev_io->u.bdev.iovs[0].iov_base == NULL) { + spdk_bdev_io_get_buf(bdev_io, raid_bdev_start_rw_request, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + } else { + /* Just call it directly if iov_base is already populated. */ + raid_bdev_start_rw_request(ch, bdev_io); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE: + raid_bdev_start_rw_request(ch, bdev_io); + break; + + case SPDK_BDEV_IO_TYPE_FLUSH: + // TODO: support flush if requirement comes + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + break; + + case SPDK_BDEV_IO_TYPE_RESET: + _raid_bdev_submit_reset_request(ch, bdev_io); + break; + + default: + SPDK_ERRLOG("submit request, invalid io type %u\n", bdev_io->type); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + break; + } + +} + +/* + * brief: + * raid_bdev_io_type_supported is the io_supported function for bdev function + * table which returns whether the particular io type is supported or not by + * raid bdev module + * params: + * ctx - pointer to raid bdev context + * type - io type + * returns: + * true - io_type is supported + * false - io_type is not supported + */ +static bool +raid_bdev_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + default: + return false; + } + + return false; +} + +/* + * brief: + * raid_bdev_get_io_channel is the get_io_channel function table pointer for + * raid bdev. 
This is used to return the io channel for this raid bdev + * params: + * ctxt - pointer to raid_bdev + * returns: + * pointer to io channel for raid bdev + */ +static struct spdk_io_channel * +raid_bdev_get_io_channel(void *ctxt) +{ + struct raid_bdev *raid_bdev = ctxt; + + return spdk_get_io_channel(raid_bdev); +} + +/* + * brief: + * raid_bdev_dump_info_json is the function table pointer for raid bdev + * params: + * ctx - pointer to raid_bdev + * w - pointer to json context + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct raid_bdev *raid_bdev = ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_dump_config_json\n"); + assert(raid_bdev != NULL); + + /* Dump the raid bdev configuration related information */ + spdk_json_write_name(w, "raid"); + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size); + spdk_json_write_named_uint32(w, "state", raid_bdev->state); + spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level); + spdk_json_write_named_uint32(w, "destruct_called", raid_bdev->destruct_called); + spdk_json_write_named_uint32(w, "num_base_bdevs", raid_bdev->num_base_bdevs); + spdk_json_write_named_uint32(w, "num_base_bdevs_discovered", raid_bdev->num_base_bdevs_discovered); + spdk_json_write_name(w, "base_bdevs_list"); + spdk_json_write_array_begin(w); + for (uint16_t i = 0; i < raid_bdev->num_base_bdevs; i++) { + if (raid_bdev->base_bdev_info[i].bdev) { + spdk_json_write_string(w, raid_bdev->base_bdev_info[i].bdev->name); + } else { + spdk_json_write_null(w); + } + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + + return 0; +} + +/* + * brief: + * raid_bdev_write_config_json is the function table pointer for raid bdev + * params: + * bdev - pointer to spdk_bdev + * w - pointer to json context + * returns: + * none + */ +static void +raid_bdev_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct raid_bdev *raid_bdev = bdev->ctxt; + struct spdk_bdev *base; + uint16_t i; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_raid_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_uint32(w, "strip_size", raid_bdev->strip_size); + spdk_json_write_named_uint32(w, "raid_level", raid_bdev->raid_level); + + spdk_json_write_named_array_begin(w, "base_bdevs"); + for (i = 0; i < raid_bdev->num_base_bdevs; i++) { + base = raid_bdev->base_bdev_info[i].bdev; + if (base) { + spdk_json_write_string(w, base->name); + } + } + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +/* g_raid_bdev_fn_table is the function table for raid bdev */ +static const struct spdk_bdev_fn_table g_raid_bdev_fn_table = { + .destruct = raid_bdev_destruct, + .submit_request = raid_bdev_submit_request, + .io_type_supported = raid_bdev_io_type_supported, + .get_io_channel = raid_bdev_get_io_channel, + .dump_info_json = raid_bdev_dump_info_json, + .write_config_json = raid_bdev_write_config_json, +}; + +/* + * brief: + * raid_bdev_config_cleanup function is used to free memory for one raid_bdev in configuration + * params: + * raid_cfg - pointer to raid_bdev_config structure + * returns: + * none + */ +void +raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg) +{ + uint32_t i; + + 
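/* Unlink the config from the global list, then free the per-slot base bdev names, the config name and the config itself. */
+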
TAILQ_REMOVE(&g_spdk_raid_config.raid_bdev_config_head, raid_cfg, link); + g_spdk_raid_config.total_raid_bdev--; + + if (raid_cfg->base_bdev) { + for (i = 0; i < raid_cfg->num_base_bdevs; i++) { + free(raid_cfg->base_bdev[i].name); + } + free(raid_cfg->base_bdev); + } + free(raid_cfg->name); + free(raid_cfg); +} + +/* + * brief: + * raid_bdev_free is the raid bdev function table function pointer. This is + * called on bdev free path + * params: + * none + * returns: + * none + */ +static void +raid_bdev_free(void) +{ + struct raid_bdev_config *raid_cfg, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_free\n"); + TAILQ_FOREACH_SAFE(raid_cfg, &g_spdk_raid_config.raid_bdev_config_head, link, tmp) { + raid_bdev_config_cleanup(raid_cfg); + } +} + +/* brief + * raid_bdev_config_find_by_name is a helper function to find raid bdev config + * by name as key. + * + * params: + * raid_name - name for raid bdev. + */ +struct raid_bdev_config * +raid_bdev_config_find_by_name(const char *raid_name) +{ + struct raid_bdev_config *raid_cfg; + + TAILQ_FOREACH(raid_cfg, &g_spdk_raid_config.raid_bdev_config_head, link) { + if (!strcmp(raid_cfg->name, raid_name)) { + return raid_cfg; + } + } + + return raid_cfg; +} + +/* + * brief + * raid_bdev_config_add function adds config for newly created raid bdev. + * + * params: + * raid_name - name for raid bdev. + * strip_size - strip size in KB + * num_base_bdevs - number of base bdevs. + * raid_level - raid level, only raid level 0 is supported. + * _raid_cfg - Pointer to newly added configuration + */ +int +raid_bdev_config_add(const char *raid_name, int strip_size, int num_base_bdevs, + int raid_level, struct raid_bdev_config **_raid_cfg) +{ + struct raid_bdev_config *raid_cfg; + + raid_cfg = raid_bdev_config_find_by_name(raid_name); + if (raid_cfg != NULL) { + SPDK_ERRLOG("Duplicate raid bdev name found in config file %s\n", + raid_name); + return -EEXIST; + } + + if (spdk_u32_is_pow2(strip_size) == false) { + SPDK_ERRLOG("Invalid strip size %d\n", strip_size); + return -EINVAL; + } + + if (num_base_bdevs <= 0) { + SPDK_ERRLOG("Invalid base device count %d\n", num_base_bdevs); + return -EINVAL; + } + + if (raid_level != 0) { + SPDK_ERRLOG("invalid raid level %d, only raid level 0 is supported\n", + raid_level); + return -EINVAL; + } + + raid_cfg = calloc(1, sizeof(*raid_cfg)); + if (raid_cfg == NULL) { + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + raid_cfg->name = strdup(raid_name); + if (!raid_cfg->name) { + free(raid_cfg); + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + raid_cfg->strip_size = strip_size; + raid_cfg->num_base_bdevs = num_base_bdevs; + raid_cfg->raid_level = raid_level; + + raid_cfg->base_bdev = calloc(num_base_bdevs, sizeof(*raid_cfg->base_bdev)); + if (raid_cfg->base_bdev == NULL) { + free(raid_cfg->name); + free(raid_cfg); + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&g_spdk_raid_config.raid_bdev_config_head, raid_cfg, link); + g_spdk_raid_config.total_raid_bdev++; + + *_raid_cfg = raid_cfg; + return 0; +} + +/* + * brief: + * raid_bdev_config_add_base_bdev function add base bdev to raid bdev config. 
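+ * It rejects a slot beyond the configured count (-EINVAL) and a base bdev name that is already used by any raid config (-EEXIST).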
+ * + * params: + * raid_cfg - pointer to raid bdev configuration + * base_bdev_name - name of base bdev + * slot - Position to add base bdev + */ +int +raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, const char *base_bdev_name, + uint32_t slot) +{ + uint32_t i; + struct raid_bdev_config *tmp; + + if (slot >= raid_cfg->num_base_bdevs) { + return -EINVAL; + } + + TAILQ_FOREACH(tmp, &g_spdk_raid_config.raid_bdev_config_head, link) { + for (i = 0; i < tmp->num_base_bdevs; i++) { + if (tmp->base_bdev[i].name != NULL) { + if (!strcmp(tmp->base_bdev[i].name, base_bdev_name)) { + SPDK_ERRLOG("duplicate base bdev name %s mentioned\n", + base_bdev_name); + return -EEXIST; + } + } + } + } + + raid_cfg->base_bdev[slot].name = strdup(base_bdev_name); + if (raid_cfg->base_bdev[slot].name == NULL) { + SPDK_ERRLOG("unable to allocate memory\n"); + return -ENOMEM; + } + + return 0; +} +/* + * brief: + * raid_bdev_parse_raid is used to parse the raid bdev from config file based on + * pre-defined raid bdev format in config file. + * Format of config file: + * [RAID1] + * Name raid1 + * StripSize 64 + * NumDevices 2 + * RaidLevel 0 + * Devices Nvme0n1 Nvme1n1 + * + * [RAID2] + * Name raid2 + * StripSize 64 + * NumDevices 3 + * RaidLevel 0 + * Devices Nvme2n1 Nvme3n1 Nvme4n1 + * + * params: + * conf_section - pointer to config section + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_parse_raid(struct spdk_conf_section *conf_section) +{ + const char *raid_name; + int strip_size; + int i, num_base_bdevs; + int raid_level; + const char *base_bdev_name; + struct raid_bdev_config *raid_cfg; + int rc; + + raid_name = spdk_conf_section_get_val(conf_section, "Name"); + if (raid_name == NULL) { + SPDK_ERRLOG("raid_name %s is null\n", raid_name); + return -EINVAL; + } + + strip_size = spdk_conf_section_get_intval(conf_section, "StripSize"); + num_base_bdevs = spdk_conf_section_get_intval(conf_section, "NumDevices"); + raid_level = spdk_conf_section_get_intval(conf_section, "RaidLevel"); + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "%s %d %d %d\n", raid_name, strip_size, num_base_bdevs, + raid_level); + + rc = raid_bdev_config_add(raid_name, strip_size, num_base_bdevs, raid_level, + &raid_cfg); + if (rc != 0) { + SPDK_ERRLOG("Failed to add raid bdev config\n"); + return rc; + } + + for (i = 0; true; i++) { + base_bdev_name = spdk_conf_section_get_nmval(conf_section, "Devices", 0, i); + if (base_bdev_name == NULL) { + break; + } + if (i >= num_base_bdevs) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Number of devices mentioned is more than count\n"); + return -EINVAL; + } + + rc = raid_bdev_config_add_base_bdev(raid_cfg, base_bdev_name, i); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Failed to add base bdev to raid bdev config\n"); + return rc; + } + } + + if (i != raid_cfg->num_base_bdevs) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Number of devices mentioned is less than count\n"); + return -EINVAL; + } + + rc = raid_bdev_create(raid_cfg); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + SPDK_ERRLOG("Failed to create raid bdev\n"); + return rc; + } + + rc = raid_bdev_add_base_devices(raid_cfg); + if (rc != 0) { + SPDK_ERRLOG("Failed to add any base bdev to raid bdev\n"); + /* Config is not removed in this case. 
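The raid bdev stays in the configuring state, so base bdevs that show up later are still claimed and added through raid_bdev_examine().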
*/ + } + + return 0; +} + +/* + * brief: + * raid_bdev_parse_config is used to find the raid bdev config section and parse it + * Format of config file: + * params: + * none + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_parse_config(void) +{ + int ret; + struct spdk_conf_section *conf_section; + + conf_section = spdk_conf_first_section(NULL); + while (conf_section != NULL) { + if (spdk_conf_section_match_prefix(conf_section, "RAID")) { + ret = raid_bdev_parse_raid(conf_section); + if (ret < 0) { + SPDK_ERRLOG("Unable to parse raid bdev section\n"); + return ret; + } + } + conf_section = spdk_conf_next_section(conf_section); + } + + return 0; +} + +/* + * brief: + * raid_bdev_fini_start is called when bdev layer is starting the + * shutdown process + * params: + * none + * returns: + * none + */ +static void +raid_bdev_fini_start(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_fini_start\n"); + g_shutdown_started = true; +} + +/* + * brief: + * raid_bdev_exit is called on raid bdev module exit time by bdev layer + * params: + * none + * returns: + * none + */ +static void +raid_bdev_exit(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_exit\n"); + raid_bdev_free(); +} + +/* + * brief: + * raid_bdev_get_ctx_size is used to return the context size of bdev_io for raid + * module + * params: + * none + * returns: + * size of spdk_bdev_io context for raid + */ +static int +raid_bdev_get_ctx_size(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_get_ctx_size\n"); + return sizeof(struct raid_bdev_io); +} + +/* + * brief: + * raid_bdev_get_running_config is used to get the configuration options. + * + * params: + * fp - The pointer to a file that will be written to the configuration options. + * returns: + * none + */ +static void +raid_bdev_get_running_config(FILE *fp) +{ + struct raid_bdev *raid_bdev; + struct spdk_bdev *base; + int index = 1; + uint16_t i; + + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_configured_list, state_link) { + fprintf(fp, + "\n" + "[RAID%d]\n" + " Name %s\n" + " StripSize %" PRIu32 "\n" + " NumDevices %hu\n" + " RaidLevel %hhu\n", + index, raid_bdev->bdev.name, raid_bdev->strip_size, + raid_bdev->num_base_bdevs, raid_bdev->raid_level); + fprintf(fp, + " Devices "); + for (i = 0; i < raid_bdev->num_base_bdevs; i++) { + base = raid_bdev->base_bdev_info[i].bdev; + if (base) { + fprintf(fp, + "%s ", + base->name); + } + } + fprintf(fp, + "\n"); + index++; + } +} + +/* + * brief: + * raid_bdev_can_claim_bdev is the function to check if this base_bdev can be + * claimed by raid bdev or not. + * params: + * bdev_name - represents base bdev name + * _raid_cfg - pointer to raid bdev config parsed from config file + * base_bdev_slot - if bdev can be claimed, it represents the base_bdev correct + * slot. This field is only valid if return value of this function is true + * returns: + * true - if bdev can be claimed + * false - if bdev can't be claimed + */ +static bool +raid_bdev_can_claim_bdev(const char *bdev_name, struct raid_bdev_config **_raid_cfg, + uint32_t *base_bdev_slot) +{ + struct raid_bdev_config *raid_cfg; + uint32_t i; + + TAILQ_FOREACH(raid_cfg, &g_spdk_raid_config.raid_bdev_config_head, link) { + for (i = 0; i < raid_cfg->num_base_bdevs; i++) { + /* + * Check if the base bdev name is part of raid bdev configuration. 
+ * If match is found then return true and the slot information where + * this base bdev should be inserted in raid bdev + */ + if (!strcmp(bdev_name, raid_cfg->base_bdev[i].name)) { + *_raid_cfg = raid_cfg; + *base_bdev_slot = i; + return true; + } + } + } + + return false; +} + + +static struct spdk_bdev_module g_raid_if = { + .name = "raid", + .module_init = raid_bdev_init, + .fini_start = raid_bdev_fini_start, + .module_fini = raid_bdev_exit, + .get_ctx_size = raid_bdev_get_ctx_size, + .examine_config = raid_bdev_examine, + .config_text = raid_bdev_get_running_config, + .async_init = false, + .async_fini = false, +}; +SPDK_BDEV_MODULE_REGISTER(&g_raid_if) + +/* + * brief: + * raid_bdev_init is the initialization function for raid bdev module + * params: + * none + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_init(void) +{ + int ret; + + TAILQ_INIT(&g_spdk_raid_bdev_configured_list); + TAILQ_INIT(&g_spdk_raid_bdev_configuring_list); + TAILQ_INIT(&g_spdk_raid_bdev_list); + TAILQ_INIT(&g_spdk_raid_bdev_offline_list); + + /* Parse config file for raids */ + ret = raid_bdev_parse_config(); + if (ret < 0) { + SPDK_ERRLOG("raid bdev init failed parsing\n"); + raid_bdev_free(); + return ret; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_init completed successfully\n"); + + return 0; +} + +/* + * brief: + * raid_bdev_create allocates raid bdev based on passed configuration + * params: + * raid_cfg - configuration of raid bdev + * returns: + * 0 - success + * non zero - failure + */ +int +raid_bdev_create(struct raid_bdev_config *raid_cfg) +{ + struct raid_bdev *raid_bdev; + struct spdk_bdev *raid_bdev_gen; + + raid_bdev = calloc(1, sizeof(*raid_bdev)); + if (!raid_bdev) { + SPDK_ERRLOG("Unable to allocate memory for raid bdev\n"); + return -ENOMEM; + } + + assert(raid_cfg->num_base_bdevs != 0); + raid_bdev->num_base_bdevs = raid_cfg->num_base_bdevs; + raid_bdev->base_bdev_info = calloc(raid_bdev->num_base_bdevs, + sizeof(struct raid_base_bdev_info)); + if (!raid_bdev->base_bdev_info) { + SPDK_ERRLOG("Unable able to allocate base bdev info\n"); + free(raid_bdev); + return -ENOMEM; + } + + raid_bdev->strip_size = raid_cfg->strip_size; + raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; + raid_bdev->config = raid_cfg; + + raid_bdev_gen = &raid_bdev->bdev; + + raid_bdev_gen->name = strdup(raid_cfg->name); + if (!raid_bdev_gen->name) { + SPDK_ERRLOG("Unable to allocate name for raid\n"); + free(raid_bdev->base_bdev_info); + free(raid_bdev); + return -ENOMEM; + } + + raid_bdev_gen->product_name = "Pooled Device"; + raid_bdev_gen->ctxt = raid_bdev; + raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; + raid_bdev_gen->module = &g_raid_if; + + TAILQ_INSERT_TAIL(&g_spdk_raid_bdev_configuring_list, raid_bdev, state_link); + TAILQ_INSERT_TAIL(&g_spdk_raid_bdev_list, raid_bdev, global_link); + + raid_cfg->raid_bdev = raid_bdev; + + return 0; +} + +/* + * brief + * raid_bdev_alloc_base_bdev_resource allocates resource of base bdev. 
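+ * It opens a descriptor on the base bdev, claims it for the raid module and stores both in the raid bdev's base bdev slot.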
+ * params: + * raid_bdev - pointer to raid bdev + * bdev - pointer to base bdev + * base_bdev_slot - position to add base bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_alloc_base_bdev_resource(struct raid_bdev *raid_bdev, struct spdk_bdev *bdev, + uint32_t base_bdev_slot) +{ + struct spdk_bdev_desc *desc; + int rc; + + rc = spdk_bdev_open(bdev, true, raid_bdev_remove_base_bdev, bdev, &desc); + if (rc != 0) { + SPDK_ERRLOG("Unable to create desc on bdev '%s'\n", bdev->name); + return rc; + } + + rc = spdk_bdev_module_claim_bdev(bdev, NULL, &g_raid_if); + if (rc != 0) { + SPDK_ERRLOG("Unable to claim this bdev as it is already claimed\n"); + spdk_bdev_close(desc); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s is claimed\n", bdev->name); + + assert(raid_bdev->state != RAID_BDEV_STATE_ONLINE); + assert(base_bdev_slot < raid_bdev->num_base_bdevs); + + raid_bdev->base_bdev_info[base_bdev_slot].bdev = bdev; + raid_bdev->base_bdev_info[base_bdev_slot].desc = desc; + raid_bdev->num_base_bdevs_discovered++; + assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); + + return 0; +} + +/* + * brief: + * If raid bdev config is complete, then only register the raid bdev to + * bdev layer and remove this raid bdev from configuring list and + * insert the raid bdev to configured list + * params: + * raid_bdev - pointer to raid bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_configure(struct raid_bdev *raid_bdev) +{ + uint32_t blocklen; + uint64_t min_blockcnt; + struct spdk_bdev *raid_bdev_gen; + int rc = 0; + + blocklen = raid_bdev->base_bdev_info[0].bdev->blocklen; + min_blockcnt = raid_bdev->base_bdev_info[0].bdev->blockcnt; + for (uint32_t i = 1; i < raid_bdev->num_base_bdevs; i++) { + /* Calculate minimum block count from all base bdevs */ + if (raid_bdev->base_bdev_info[i].bdev->blockcnt < min_blockcnt) { + min_blockcnt = raid_bdev->base_bdev_info[i].bdev->blockcnt; + } + + /* Check blocklen for all base bdevs that it should be same */ + if (blocklen != raid_bdev->base_bdev_info[i].bdev->blocklen) { + /* + * Assumption is that all the base bdevs for any raid bdev should + * have same blocklen + */ + SPDK_ERRLOG("Blocklen of various bdevs not matching\n"); + return -EINVAL; + } + } + + raid_bdev_gen = &raid_bdev->bdev; + raid_bdev_gen->write_cache = 0; + raid_bdev_gen->blocklen = blocklen; + raid_bdev_gen->ctxt = raid_bdev; + raid_bdev_gen->fn_table = &g_raid_bdev_fn_table; + raid_bdev_gen->module = &g_raid_if; + raid_bdev->strip_size = (raid_bdev->strip_size * 1024) / blocklen; + raid_bdev->strip_size_shift = spdk_u32log2(raid_bdev->strip_size); + raid_bdev->blocklen_shift = spdk_u32log2(blocklen); + if (raid_bdev->num_base_bdevs > 1) { + raid_bdev_gen->optimal_io_boundary = raid_bdev->strip_size; + raid_bdev_gen->split_on_optimal_io_boundary = true; + } else { + /* Do not need to split reads/writes on single bdev RAID modules. 
*/ + raid_bdev_gen->optimal_io_boundary = 0; + raid_bdev_gen->split_on_optimal_io_boundary = false; + } + + /* + * RAID bdev logic is for striping so take the minimum block count based + * approach where total block count of raid bdev is the number of base + * bdev times the minimum block count of any base bdev + */ + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "min blockcount %lu, numbasedev %u, strip size shift %u\n", + min_blockcnt, + raid_bdev->num_base_bdevs, raid_bdev->strip_size_shift); + raid_bdev_gen->blockcnt = ((min_blockcnt >> raid_bdev->strip_size_shift) << + raid_bdev->strip_size_shift) * raid_bdev->num_base_bdevs; + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "io device register %p\n", raid_bdev); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "blockcnt %lu, blocklen %u\n", raid_bdev_gen->blockcnt, + raid_bdev_gen->blocklen); + if (raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) { + raid_bdev->state = RAID_BDEV_STATE_ONLINE; + spdk_io_device_register(raid_bdev, raid_bdev_create_cb, raid_bdev_destroy_cb, + sizeof(struct raid_bdev_io_channel), + raid_bdev->bdev.name); + rc = spdk_bdev_register(raid_bdev_gen); + if (rc != 0) { + SPDK_ERRLOG("Unable to register pooled bdev and stay at configuring state\n"); + spdk_io_device_unregister(raid_bdev, NULL); + raid_bdev->state = RAID_BDEV_STATE_CONFIGURING; + return rc; + } + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev generic %p\n", raid_bdev_gen); + TAILQ_REMOVE(&g_spdk_raid_bdev_configuring_list, raid_bdev, state_link); + TAILQ_INSERT_TAIL(&g_spdk_raid_bdev_configured_list, raid_bdev, state_link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev is created with name %s, raid_bdev %p\n", + raid_bdev_gen->name, raid_bdev); + } + + return 0; +} + +/* + * brief: + * If raid bdev is online and registered, change the bdev state to + * configuring and unregister this raid device. Queue this raid device + * in configuring list + * params: + * raid_bdev - pointer to raid bdev + * returns: + * none + */ +static void +raid_bdev_deconfigure(struct raid_bdev *raid_bdev) +{ + if (raid_bdev->state != RAID_BDEV_STATE_ONLINE) { + return; + } + + assert(raid_bdev->num_base_bdevs == raid_bdev->num_base_bdevs_discovered); + TAILQ_REMOVE(&g_spdk_raid_bdev_configured_list, raid_bdev, state_link); + raid_bdev->state = RAID_BDEV_STATE_OFFLINE; + assert(raid_bdev->num_base_bdevs_discovered); + TAILQ_INSERT_TAIL(&g_spdk_raid_bdev_offline_list, raid_bdev, state_link); + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid bdev state chaning from online to offline\n"); + + spdk_io_device_unregister(raid_bdev, NULL); + spdk_bdev_unregister(&raid_bdev->bdev, NULL, NULL); +} + +/* + * brief: + * raid_bdev_remove_base_bdev function is called by below layers when base_bdev + * is removed. This function checks if this base bdev is part of any raid bdev + * or not. If yes, it takes necessary action on that particular raid bdev. 
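+ * The affected slot is marked for removal; depending on the raid bdev state, the base bdev resources are freed immediately or the whole raid bdev is deconfigured.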
+ * params: + * ctx - pointer to the base bdev which got removed + * returns: + * none + */ +void +raid_bdev_remove_base_bdev(void *ctx) +{ + struct spdk_bdev *base_bdev = ctx; + struct raid_bdev *raid_bdev; + uint16_t i; + bool found = false; + + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "raid_bdev_remove_base_bdev\n"); + + /* Find the raid_bdev which has claimed this base_bdev */ + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_list, global_link) { + for (i = 0; i < raid_bdev->num_base_bdevs; i++) { + if (raid_bdev->base_bdev_info[i].bdev == base_bdev) { + found = true; + break; + } + } + if (found == true) { + break; + } + } + + if (found == false) { + SPDK_ERRLOG("bdev to remove '%s' not found\n", base_bdev->name); + return; + } + + assert(raid_bdev != NULL); + assert(raid_bdev->base_bdev_info[i].bdev); + assert(raid_bdev->base_bdev_info[i].desc); + raid_bdev->base_bdev_info[i].remove_scheduled = true; + + if ((raid_bdev->destruct_called == true || + raid_bdev->state == RAID_BDEV_STATE_CONFIGURING) && + raid_bdev->base_bdev_info[i].bdev != NULL) { + /* + * As the raid bdev is either not registered yet or already unregistered, + * cleanup should be done here itself + */ + raid_bdev_free_base_bdev_resource(raid_bdev, i); + if (raid_bdev->num_base_bdevs_discovered == 0) { + /* There is no base bdev left for this raid, so free the raid device */ + raid_bdev_cleanup(raid_bdev); + return; + } + } + + raid_bdev_deconfigure(raid_bdev); +} + +/* + * brief: + * raid_bdev_add_base_device function is the actual function which either adds + * the base device to an existing raid bdev or creates a new raid bdev. It also claims + * the base device and keeps the open descriptor. + * params: + * raid_cfg - pointer to raid bdev config + * bdev - pointer to base bdev + * base_bdev_slot - position to add base bdev + * returns: + * 0 - success + * non zero - failure + */ +static int +raid_bdev_add_base_device(struct raid_bdev_config *raid_cfg, struct spdk_bdev *bdev, + uint32_t base_bdev_slot) +{ + struct raid_bdev *raid_bdev; + int rc; + + raid_bdev = raid_cfg->raid_bdev; + if (!raid_bdev) { + SPDK_ERRLOG("Raid bdev is not created yet '%s'\n", bdev->name); + return -ENODEV; + } + + rc = raid_bdev_alloc_base_bdev_resource(raid_bdev, bdev, base_bdev_slot); + if (rc != 0) { + SPDK_ERRLOG("Failed to allocate resource for bdev '%s'\n", bdev->name); + return rc; + } + + assert(raid_bdev->num_base_bdevs_discovered <= raid_bdev->num_base_bdevs); + + if (raid_bdev->num_base_bdevs_discovered == raid_bdev->num_base_bdevs) { + rc = raid_bdev_configure(raid_bdev); + if (rc != 0) { + SPDK_ERRLOG("Failed to configure raid bdev\n"); + return rc; + } + } + + return 0; +} + +/* + * brief: + * Add base bdevs to the raid bdev one by one. Skip any base bdev which doesn't + * exist or fails to add. If all base bdevs are successfully added, the raid bdev + * moves to the configured state and becomes available. Otherwise, the raid bdev + * stays at the configuring state with added base bdevs. + * params: + * raid_cfg - pointer to raid bdev config + * returns: + * 0 - The raid bdev moves to the configured state or stays at the configuring + * state with added base bdevs due to any nonexistent base bdev. + * non zero - Failed to add any base bdev and stays at the configuring state with + * added base bdevs. 
+ */ +int +raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg) +{ + struct spdk_bdev *base_bdev; + uint8_t i; + int rc = 0, _rc; + + for (i = 0; i < raid_cfg->num_base_bdevs; i++) { + base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name); + if (base_bdev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "base bdev %s doesn't exist now\n", + raid_cfg->base_bdev[i].name); + continue; + } + + _rc = raid_bdev_add_base_device(raid_cfg, base_bdev, i); + if (_rc != 0) { + SPDK_ERRLOG("Failed to add base bdev %s to RAID bdev %s: %s\n", + raid_cfg->base_bdev[i].name, raid_cfg->name, + spdk_strerror(-_rc)); + if (rc == 0) { + rc = _rc; + } + } + } + + return rc; +} + +/* + * brief: + * raid_bdev_examine function is the examine function call by the below layers + * like bdev_nvme layer. This function will check if this base bdev can be + * claimed by this raid bdev or not. + * params: + * bdev - pointer to base bdev + * returns: + * none + */ +static void +raid_bdev_examine(struct spdk_bdev *bdev) +{ + struct raid_bdev_config *raid_cfg; + uint32_t base_bdev_slot; + + if (raid_bdev_can_claim_bdev(bdev->name, &raid_cfg, &base_bdev_slot)) { + raid_bdev_add_base_device(raid_cfg, bdev, base_bdev_slot); + } else { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RAID, "bdev %s can't be claimed\n", + bdev->name); + } + + spdk_bdev_module_examine_done(&g_raid_if); +} + +/* Log component for bdev raid bdev module */ +SPDK_LOG_REGISTER_COMPONENT("bdev_raid", SPDK_LOG_BDEV_RAID) diff --git a/src/spdk/lib/bdev/raid/bdev_raid.h b/src/spdk/lib/bdev/raid/bdev_raid.h new file mode 100644 index 00000000..39f055ed --- /dev/null +++ b/src/spdk/lib/bdev/raid/bdev_raid.h @@ -0,0 +1,225 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_RAID_INTERNAL_H +#define SPDK_BDEV_RAID_INTERNAL_H + +#include "spdk/bdev_module.h" + +/* + * Raid state describes the state of the raid. 
This raid bdev can be either in + * configured list or configuring list + */ +enum raid_bdev_state { + /* raid bdev is ready and is seen by upper layers */ + RAID_BDEV_STATE_ONLINE, + + /* + * raid bdev is configuring, not all underlying bdevs are present. + * And can't be seen by upper layers. + */ + RAID_BDEV_STATE_CONFIGURING, + + /* + * In offline state, raid bdev layer will complete all incoming commands without + * submitting to underlying base nvme bdevs + */ + RAID_BDEV_STATE_OFFLINE, + + /* raid bdev max, new states should be added before this */ + RAID_BDEV_MAX +}; + +/* + * raid_base_bdev_info contains information for the base bdevs which are part of some + * raid. This structure contains the per base bdev information. Whatever is + * required per base device for raid bdev will be kept here + */ +struct raid_base_bdev_info { + /* pointer to base spdk bdev */ + struct spdk_bdev *bdev; + + /* pointer to base bdev descriptor opened by raid bdev */ + struct spdk_bdev_desc *desc; + + /* + * When underlying base device calls the hot plug function on drive removal, + * this flag will be set and later after doing some processing, base device + * descriptor will be closed + */ + bool remove_scheduled; +}; + +/* + * raid_bdev is the single entity structure which contains SPDK block device + * and the information related to any raid bdev either configured or + * in configuring list. io device is created on this. + */ +struct raid_bdev { + /* raid bdev device, this will get registered in bdev layer */ + struct spdk_bdev bdev; + + /* link of raid bdev to link it to configured, configuring or offline list */ + TAILQ_ENTRY(raid_bdev) state_link; + + /* link of raid bdev to link it to global raid bdev list */ + TAILQ_ENTRY(raid_bdev) global_link; + + /* pointer to config file entry */ + struct raid_bdev_config *config; + + /* array of base bdev info */ + struct raid_base_bdev_info *base_bdev_info; + + /* strip size of raid bdev in blocks */ + uint32_t strip_size; + + /* strip size bit shift for optimized calculation */ + uint32_t strip_size_shift; + + /* block length bit shift for optimized calculation */ + uint32_t blocklen_shift; + + /* state of raid bdev */ + enum raid_bdev_state state; + + /* number of base bdevs comprising raid bdev */ + uint16_t num_base_bdevs; + + /* number of base bdevs discovered */ + uint16_t num_base_bdevs_discovered; + + /* Raid Level of this raid bdev */ + uint8_t raid_level; + + /* Set to true if destruct is called for this raid bdev */ + bool destruct_called; +}; + +/* + * raid_bdev_io is the context part of bdev_io. It contains the information + * related to bdev_io for a pooled bdev + */ +struct raid_bdev_io { + /* WaitQ entry, used only in waitq logic */ + struct spdk_bdev_io_wait_entry waitq_entry; + + /* Original channel for this IO, used in queuing logic */ + struct spdk_io_channel *ch; + + /* Used for tracking progress on resets sent to member disks. 
*/ + uint8_t base_bdev_reset_submitted; + uint8_t base_bdev_reset_completed; + uint8_t base_bdev_reset_status; +}; + +/* + * raid_base_bdev_config is the per base bdev data structure which contains + * information w.r.t to per base bdev during parsing config + */ +struct raid_base_bdev_config { + /* base bdev name from config file */ + char *name; +}; + +/* + * raid_bdev_config contains the raid bdev config related information after + * parsing the config file + */ +struct raid_bdev_config { + /* base bdev config per underlying bdev */ + struct raid_base_bdev_config *base_bdev; + + /* Points to already created raid bdev */ + struct raid_bdev *raid_bdev; + + char *name; + + /* strip size of this raid bdev in kilo bytes */ + uint32_t strip_size; + + /* number of base bdevs */ + uint8_t num_base_bdevs; + + /* raid level */ + uint8_t raid_level; + + TAILQ_ENTRY(raid_bdev_config) link; +}; + +/* + * raid_config is the top level structure representing the raid bdev config as read + * from config file for all raids + */ +struct raid_config { + /* raid bdev context from config file */ + TAILQ_HEAD(, raid_bdev_config) raid_bdev_config_head; + + /* total raid bdev from config file */ + uint8_t total_raid_bdev; +}; + +/* + * raid_bdev_io_channel is the context of spdk_io_channel for raid bdev device. It + * contains the relationship of raid bdev io channel with base bdev io channels. + */ +struct raid_bdev_io_channel { + /* Array of IO channels of base bdevs */ + struct spdk_io_channel **base_channel; +}; + +/* TAIL heads for various raid bdev lists */ +TAILQ_HEAD(spdk_raid_configured_tailq, raid_bdev); +TAILQ_HEAD(spdk_raid_configuring_tailq, raid_bdev); +TAILQ_HEAD(spdk_raid_all_tailq, raid_bdev); +TAILQ_HEAD(spdk_raid_offline_tailq, raid_bdev); + +extern struct spdk_raid_configured_tailq g_spdk_raid_bdev_configured_list; +extern struct spdk_raid_configuring_tailq g_spdk_raid_bdev_configuring_list; +extern struct spdk_raid_all_tailq g_spdk_raid_bdev_list; +extern struct spdk_raid_offline_tailq g_spdk_raid_bdev_offline_list; +extern struct raid_config g_spdk_raid_config; + +int raid_bdev_create(struct raid_bdev_config *raid_cfg); +void raid_bdev_remove_base_bdev(void *ctx); +int raid_bdev_add_base_devices(struct raid_bdev_config *raid_cfg); +void raid_bdev_free_base_bdev_resource(struct raid_bdev *raid_bdev, uint32_t slot); +void raid_bdev_cleanup(struct raid_bdev *raid_bdev); +int raid_bdev_config_add(const char *raid_name, int strip_size, int num_base_bdevs, + int raid_level, struct raid_bdev_config **_raid_cfg); +int raid_bdev_config_add_base_bdev(struct raid_bdev_config *raid_cfg, + const char *base_bdev_name, uint32_t slot); +void raid_bdev_config_cleanup(struct raid_bdev_config *raid_cfg); +struct raid_bdev_config *raid_bdev_config_find_by_name(const char *raid_name); + +#endif // SPDK_BDEV_RAID_INTERNAL_H diff --git a/src/spdk/lib/bdev/raid/bdev_raid_rpc.c b/src/spdk/lib/bdev/raid/bdev_raid_rpc.c new file mode 100644 index 00000000..00b3bc9d --- /dev/null +++ b/src/spdk/lib/bdev/raid/bdev_raid_rpc.c @@ -0,0 +1,408 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/bdev.h" +#include "bdev_raid.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" +#include "spdk/env.h" + +#define RPC_MAX_BASE_BDEVS 255 + +SPDK_LOG_REGISTER_COMPONENT("raidrpc", SPDK_LOG_RAID_RPC) + +/* + * Input structure for get_raid_bdevs RPC + */ +struct rpc_get_raid_bdevs { + /* category - all or online or configuring or offline */ + char *category; +}; + +/* + * brief: + * free_rpc_get_raids function frees RPC get_raids related parameters + * params: + * req - pointer to RPC request + * returns: + * none + */ +static void +free_rpc_get_raid_bdevs(struct rpc_get_raid_bdevs *req) +{ + free(req->category); +} + +/* + * Decoder object for RPC get_raids + */ +static const struct spdk_json_object_decoder rpc_get_raid_bdevs_decoders[] = { + {"category", offsetof(struct rpc_get_raid_bdevs, category), spdk_json_decode_string}, +}; + +/* + * brief: + * spdk_rpc_get_raids function is the RPC for get_raids. This is used to list + * all the raid bdev names based on the input category requested. Category should be + * one of "all", "online", "configuring" or "offline". "all" means all the raids + * whether they are online or configuring or offline. "online" is the raid bdev which + * is registered with bdev layer. "configuring" is the raid bdev which does not have + * full configuration discovered yet. "offline" is the raid bdev which is not + * registered with bdev as of now and it has encountered any error or user has + * requested to offline the raid. 
+ * params: + * request - pointer to json rpc request + * params - pointer to request parameters + * returns: + * none + */ +static void +spdk_rpc_get_raid_bdevs(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_get_raid_bdevs req = {}; + struct spdk_json_write_ctx *w; + struct raid_bdev *raid_bdev; + + if (spdk_json_decode_object(params, rpc_get_raid_bdevs_decoders, + SPDK_COUNTOF(rpc_get_raid_bdevs_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_get_raid_bdevs(&req); + return; + } + + if (!(strcmp(req.category, "all") == 0 || + strcmp(req.category, "online") == 0 || + strcmp(req.category, "configuring") == 0 || + strcmp(req.category, "offline") == 0)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_get_raid_bdevs(&req); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_get_raid_bdevs(&req); + return; + } + + spdk_json_write_array_begin(w); + + /* Get raid bdev list based on the category requested */ + if (strcmp(req.category, "all") == 0) { + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_list, global_link) { + spdk_json_write_string(w, raid_bdev->bdev.name); + } + } else if (strcmp(req.category, "online") == 0) { + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_configured_list, state_link) { + spdk_json_write_string(w, raid_bdev->bdev.name); + } + } else if (strcmp(req.category, "configuring") == 0) { + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_configuring_list, state_link) { + spdk_json_write_string(w, raid_bdev->bdev.name); + } + } else { + TAILQ_FOREACH(raid_bdev, &g_spdk_raid_bdev_offline_list, state_link) { + spdk_json_write_string(w, raid_bdev->bdev.name); + } + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free_rpc_get_raid_bdevs(&req); +} +SPDK_RPC_REGISTER("get_raid_bdevs", spdk_rpc_get_raid_bdevs, SPDK_RPC_RUNTIME) + +/* + * Base bdevs in RPC construct_raid + */ +struct rpc_construct_raid_base_bdevs { + /* Number of base bdevs */ + size_t num_base_bdevs; + + /* List of base bdev names */ + char *base_bdevs[RPC_MAX_BASE_BDEVS]; +}; + +/* + * Input structure for RPC construct_raid + */ +struct rpc_construct_raid_bdev { + /* Raid bdev name */ + char *name; + + /* RAID strip size */ + uint32_t strip_size; + + /* RAID level */ + uint8_t raid_level; + + /* Base bdevs information */ + struct rpc_construct_raid_base_bdevs base_bdevs; +}; + +/* + * brief: + * free_rpc_construct_raid_bdev function is used to free RPC construct_raid_bdev related parameters + * params: + * req - pointer to RPC request + * returns: + * none + */ +static void +free_rpc_construct_raid_bdev(struct rpc_construct_raid_bdev *req) +{ + free(req->name); + for (size_t i = 0; i < req->base_bdevs.num_base_bdevs; i++) { + free(req->base_bdevs.base_bdevs[i]); + } +} + +/* + * Decoder function for RPC construct_raid_bdev to decode base bdevs list + */ +static int +decode_base_bdevs(const struct spdk_json_val *val, void *out) +{ + struct rpc_construct_raid_base_bdevs *base_bdevs = out; + return spdk_json_decode_array(val, spdk_json_decode_string, base_bdevs->base_bdevs, + RPC_MAX_BASE_BDEVS, &base_bdevs->num_base_bdevs, sizeof(char *)); +} + +/* + * Decoder object for RPC construct_raid + */ +static const struct spdk_json_object_decoder rpc_construct_raid_bdev_decoders[] = { + {"name", offsetof(struct rpc_construct_raid_bdev, name), spdk_json_decode_string}, + 
{"strip_size", offsetof(struct rpc_construct_raid_bdev, strip_size), spdk_json_decode_uint32}, + {"raid_level", offsetof(struct rpc_construct_raid_bdev, raid_level), spdk_json_decode_uint32}, + {"base_bdevs", offsetof(struct rpc_construct_raid_bdev, base_bdevs), decode_base_bdevs}, +}; + +/* + * brief: + * spdk_rpc_construct_raid_bdev function is the RPC for construct_raids. It takes + * input as raid bdev name, raid level, strip size in KB and list of base bdev names. + * params: + * requuest - pointer to json rpc request + * params - pointer to request parameters + * returns: + * none + */ +static void +spdk_rpc_construct_raid_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_raid_bdev req = {}; + struct spdk_json_write_ctx *w; + struct raid_bdev_config *raid_cfg; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_raid_bdev_decoders, + SPDK_COUNTOF(rpc_construct_raid_bdev_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_construct_raid_bdev(&req); + return; + } + + rc = raid_bdev_config_add(req.name, req.strip_size, req.base_bdevs.num_base_bdevs, req.raid_level, + &raid_cfg); + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add RAID bdev config %s: %s", + req.name, spdk_strerror(-rc)); + free_rpc_construct_raid_bdev(&req); + return; + } + + for (size_t i = 0; i < req.base_bdevs.num_base_bdevs; i++) { + rc = raid_bdev_config_add_base_bdev(raid_cfg, req.base_bdevs.base_bdevs[i], i); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add base bdev %s to RAID bdev config %s: %s", + req.base_bdevs.base_bdevs[i], req.name, + spdk_strerror(-rc)); + free_rpc_construct_raid_bdev(&req); + return; + } + } + + rc = raid_bdev_create(raid_cfg); + if (rc != 0) { + raid_bdev_config_cleanup(raid_cfg); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to create RAID bdev %s: %s", + req.name, spdk_strerror(-rc)); + free_rpc_construct_raid_bdev(&req); + return; + } + + rc = raid_bdev_add_base_devices(raid_cfg); + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add any base bdev to RAID bdev %s: %s", + req.name, spdk_strerror(-rc)); + free_rpc_construct_raid_bdev(&req); + return; + } + + free_rpc_construct_raid_bdev(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("construct_raid_bdev", spdk_rpc_construct_raid_bdev, SPDK_RPC_RUNTIME) + +/* + * Input structure for RPC destroy_raid + */ +struct rpc_destroy_raid_bdev { + /* raid bdev name */ + char *name; +}; + +/* + * brief: + * free_rpc_destroy_raid_bdev function is used to free RPC destroy_raid_bdev related parameters + * params: + * req - pointer to RPC request + * params: + * none + */ +static void +free_rpc_destroy_raid_bdev(struct rpc_destroy_raid_bdev *req) +{ + free(req->name); +} + +/* + * Decoder object for RPC destroy_raid + */ +static const struct spdk_json_object_decoder rpc_destroy_raid_bdev_decoders[] = { + {"name", offsetof(struct rpc_destroy_raid_bdev, name), spdk_json_decode_string}, +}; + +/* + * brief: + * Since destroying raid_bdev is asynchronous operation, so this function is + * used to 
check if the raid bdev still exists. If it is still there, it will create an + * event and check again later; otherwise it will proceed with cleanup + * params: + * arg - pointer to raid bdev cfg + * returns: + * none + */ +static void +raid_bdev_config_destroy(void *arg) +{ + struct raid_bdev_config *raid_cfg = arg; + + assert(raid_cfg != NULL); + if (raid_cfg->raid_bdev != NULL) { + /* + * If raid bdev exists for this config, wait for raid bdev to get + * destroyed and come back later + */ + spdk_thread_send_msg(spdk_get_thread(), raid_bdev_config_destroy, + raid_cfg); + } else { + raid_bdev_config_cleanup(raid_cfg); + } +} + +/* + * brief: + * spdk_rpc_destroy_raid_bdev function is the RPC for destroy_raid_bdev. It takes the raid + * name as input and destroys that raid bdev, including freeing the base bdev + * resources. + * params: + * request - pointer to json rpc request + * params - pointer to request parameters + * returns: + * none + */ +static void +spdk_rpc_destroy_raid_bdev(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct rpc_destroy_raid_bdev req = {}; + struct spdk_json_write_ctx *w; + struct raid_bdev_config *raid_cfg = NULL; + struct spdk_bdev *base_bdev; + + if (spdk_json_decode_object(params, rpc_destroy_raid_bdev_decoders, + SPDK_COUNTOF(rpc_destroy_raid_bdev_decoders), + &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_destroy_raid_bdev(&req); + return; + } + + raid_cfg = raid_bdev_config_find_by_name(req.name); + if (raid_cfg == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "raid bdev %s is not found in config", req.name); + free_rpc_destroy_raid_bdev(&req); + return; + } + + /* Remove all the base bdevs from this raid bdev before destroying the raid bdev */ + for (uint32_t i = 0; i < raid_cfg->num_base_bdevs; i++) { + base_bdev = spdk_bdev_get_by_name(raid_cfg->base_bdev[i].name); + if (base_bdev != NULL) { + raid_bdev_remove_base_bdev(base_bdev); + } + } + + raid_bdev_config_destroy(raid_cfg); + + free_rpc_destroy_raid_bdev(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("destroy_raid_bdev", spdk_rpc_destroy_raid_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/rbd/Makefile b/src/spdk/lib/bdev/rbd/Makefile new file mode 100644 index 00000000..e7c97aca --- /dev/null +++ b/src/spdk/lib/bdev/rbd/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_rbd.c bdev_rbd_rpc.c +LIBNAME = bdev_rbd + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/rbd/bdev_rbd.c b/src/spdk/lib/bdev/rbd/bdev_rbd.c new file mode 100644 index 00000000..34c2466b --- /dev/null +++ b/src/spdk/lib/bdev/rbd/bdev_rbd.c @@ -0,0 +1,740 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "bdev_rbd.h" + +#include +#include +#include + +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +#define SPDK_RBD_QUEUE_DEPTH 128 + +static int bdev_rbd_count = 0; + +#define BDEV_RBD_POLL_US 50 + +struct bdev_rbd { + struct spdk_bdev disk; + char *rbd_name; + char *pool_name; + rbd_image_info_t info; + TAILQ_ENTRY(bdev_rbd) tailq; + struct spdk_poller *reset_timer; + struct spdk_bdev_io *reset_bdev_io; +}; + +struct bdev_rbd_io_channel { + rados_ioctx_t io_ctx; + rados_t cluster; + struct pollfd pfd; + rbd_image_t image; + struct bdev_rbd *disk; + struct spdk_poller *poller; +}; + +struct bdev_rbd_io { + uint64_t remaining_len; + int num_segments; + bool failed; +}; + +static void +bdev_rbd_free(struct bdev_rbd *rbd) +{ + if (!rbd) { + return; + } + + free(rbd->disk.name); + free(rbd->rbd_name); + free(rbd->pool_name); + free(rbd); +} + +static int +bdev_rados_context_init(const char *rbd_pool_name, rados_t *cluster, + rados_ioctx_t *io_ctx) +{ + int ret; + + ret = rados_create(cluster, NULL); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados_t struct\n"); + return -1; + } + + ret = rados_conf_read_file(*cluster, NULL); + if (ret < 0) { + SPDK_ERRLOG("Failed to read conf file\n"); + rados_shutdown(*cluster); + return -1; + } + + ret = rados_connect(*cluster); + if (ret < 0) { + SPDK_ERRLOG("Failed to connect to rbd_pool\n"); + rados_shutdown(*cluster); + return -1; + } + + ret = rados_ioctx_create(*cluster, rbd_pool_name, io_ctx); + + if (ret < 0) { + SPDK_ERRLOG("Failed to create ioctx\n"); + rados_shutdown(*cluster); + return -1; + } + + return 0; +} + +static int +bdev_rbd_init(const char *rbd_pool_name, const char *rbd_name, rbd_image_info_t *info) +{ + int ret; + rados_t cluster = NULL; + rados_ioctx_t io_ctx = NULL; + rbd_image_t image = NULL; + + ret = bdev_rados_context_init(rbd_pool_name, &cluster, &io_ctx); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados context for rbd_pool=%s\n", + rbd_pool_name); + return -1; + } + + ret = rbd_open(io_ctx, rbd_name, &image, NULL); + if (ret < 0) { + SPDK_ERRLOG("Failed to open specified rbd device\n"); + goto err; + } + ret = rbd_stat(image, info, sizeof(*info)); + rbd_close(image); + if (ret < 0) { + SPDK_ERRLOG("Failed to stat specified rbd device\n"); + goto err; + } + + rados_ioctx_destroy(io_ctx); + return 0; +err: + rados_ioctx_destroy(io_ctx); + rados_shutdown(cluster); + return -1; +} + +static void +bdev_rbd_exit(rbd_image_t image) +{ + rbd_flush(image); + rbd_close(image); +} + +static void +bdev_rbd_finish_aiocb(rbd_completion_t cb, void *arg) +{ + /* Doing nothing here */ +} + +static int +bdev_rbd_start_aio(rbd_image_t image, struct spdk_bdev_io *bdev_io, + void *buf, uint64_t offset, size_t len) +{ + int ret; + rbd_completion_t comp; + + ret = rbd_aio_create_completion(bdev_io, bdev_rbd_finish_aiocb, + &comp); + if (ret < 0) { + return -1; + } + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + ret = rbd_aio_read(image, offset, len, + buf, comp); + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { + ret = rbd_aio_write(image, offset, len, + buf, comp); + } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_FLUSH) { + ret = rbd_aio_flush(image, comp); + } + + if (ret < 0) { + rbd_aio_release(comp); + return -1; + } + + return 0; +} + +static int bdev_rbd_library_init(void); + +static int 
+bdev_rbd_get_ctx_size(void) +{ + return sizeof(struct bdev_rbd_io); +} + +static struct spdk_bdev_module rbd_if = { + .name = "rbd", + .module_init = bdev_rbd_library_init, + .get_ctx_size = bdev_rbd_get_ctx_size, + +}; +SPDK_BDEV_MODULE_REGISTER(&rbd_if) + +static int64_t +bdev_rbd_rw(struct bdev_rbd *disk, struct spdk_io_channel *ch, + struct spdk_bdev_io *bdev_io, struct iovec *iov, + int iovcnt, size_t len, uint64_t offset) +{ + struct bdev_rbd_io *rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); + size_t remaining = len; + int i, rc; + + rbd_io->remaining_len = 0; + rbd_io->num_segments = 0; + rbd_io->failed = false; + + for (i = 0; i < iovcnt && remaining > 0; i++) { + size_t seg_len = spdk_min(remaining, iov[i].iov_len); + + rc = bdev_rbd_start_aio(rbdio_ch->image, bdev_io, iov[i].iov_base, offset, seg_len); + if (rc) { + /* + * This bdev_rbd_start_aio() call failed, but if any previous ones were + * submitted, we need to wait for them to finish. + */ + if (rbd_io->num_segments == 0) { + /* No previous I/O submitted - return error code immediately. */ + return rc; + } + + /* Return and wait for outstanding I/O to complete. */ + rbd_io->failed = true; + return 0; + } + + rbd_io->num_segments++; + rbd_io->remaining_len += seg_len; + + offset += seg_len; + remaining -= seg_len; + } + + return 0; +} + +static int64_t +bdev_rbd_flush(struct bdev_rbd *disk, struct spdk_io_channel *ch, + struct spdk_bdev_io *bdev_io, uint64_t offset, uint64_t nbytes) +{ + struct bdev_rbd_io_channel *rbdio_ch = spdk_io_channel_get_ctx(ch); + + return bdev_rbd_start_aio(rbdio_ch->image, bdev_io, NULL, offset, nbytes); +} + +static int +bdev_rbd_reset_timer(void *arg) +{ + struct bdev_rbd *disk = arg; + + /* + * TODO: This should check if any I/O is still in flight before completing the reset. + * For now, just complete after the timer expires. + */ + spdk_bdev_io_complete(disk->reset_bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + spdk_poller_unregister(&disk->reset_timer); + disk->reset_bdev_io = NULL; + + return -1; +} + +static int +bdev_rbd_reset(struct bdev_rbd *disk, struct spdk_bdev_io *bdev_io) +{ + /* + * HACK: Since librbd doesn't provide any way to cancel outstanding aio, just kick off a + * timer to wait for in-flight I/O to complete. 
+ */ + assert(disk->reset_bdev_io == NULL); + disk->reset_bdev_io = bdev_io; + disk->reset_timer = spdk_poller_register(bdev_rbd_reset_timer, disk, 1 * 1000 * 1000); + + return 0; +} + +static int +bdev_rbd_destruct(void *ctx) +{ + struct bdev_rbd *rbd = ctx; + + spdk_io_device_unregister(rbd, NULL); + + bdev_rbd_free(rbd); + return 0; +} + +static void bdev_rbd_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + int ret; + + ret = bdev_rbd_rw(bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + + if (ret != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static int _bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_rbd_get_buf_cb, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + + case SPDK_BDEV_IO_TYPE_WRITE: + return bdev_rbd_rw((struct bdev_rbd *)bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.iovs, + bdev_io->u.bdev.iovcnt, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen); + + case SPDK_BDEV_IO_TYPE_FLUSH: + return bdev_rbd_flush((struct bdev_rbd *)bdev_io->bdev->ctxt, + ch, + bdev_io, + bdev_io->u.bdev.offset_blocks * bdev_io->bdev->blocklen, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + + case SPDK_BDEV_IO_TYPE_RESET: + return bdev_rbd_reset((struct bdev_rbd *)bdev_io->bdev->ctxt, + bdev_io); + + default: + return -1; + } + return 0; +} + +static void bdev_rbd_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_rbd_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_rbd_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + default: + return false; + } +} + +static int +bdev_rbd_io_poll(void *arg) +{ + struct bdev_rbd_io_channel *ch = arg; + int i, io_status, rc; + rbd_completion_t comps[SPDK_RBD_QUEUE_DEPTH]; + struct spdk_bdev_io *bdev_io; + struct bdev_rbd_io *rbd_io; + + rc = poll(&ch->pfd, 1, 0); + + /* check the return value of poll since we have only one fd for each channel */ + if (rc != 1) { + return 0; + } + + rc = rbd_poll_io_events(ch->image, comps, SPDK_RBD_QUEUE_DEPTH); + for (i = 0; i < rc; i++) { + bdev_io = rbd_aio_get_arg(comps[i]); + rbd_io = (struct bdev_rbd_io *)bdev_io->driver_ctx; + io_status = rbd_aio_get_return_value(comps[i]); + + assert(rbd_io->num_segments > 0); + rbd_io->num_segments--; + + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + if (io_status > 0) { + /* For reads, io_status is the length */ + rbd_io->remaining_len -= io_status; + } + + if (rbd_io->num_segments == 0 && rbd_io->remaining_len != 0) { + rbd_io->failed = true; + } + } else { + /* For others, 0 means success */ + if (io_status != 0) { + rbd_io->failed = true; + } + } + + rbd_aio_release(comps[i]); + + if (rbd_io->num_segments == 0) { + spdk_bdev_io_complete(bdev_io, + rbd_io->failed ? 
SPDK_BDEV_IO_STATUS_FAILED : SPDK_BDEV_IO_STATUS_SUCCESS); + } + } + + return rc; +} + +static void +bdev_rbd_free_channel(struct bdev_rbd_io_channel *ch) +{ + if (!ch) { + return; + } + + if (ch->image) { + bdev_rbd_exit(ch->image); + } + + if (ch->io_ctx) { + rados_ioctx_destroy(ch->io_ctx); + } + + if (ch->cluster) { + rados_shutdown(ch->cluster); + } + + if (ch->pfd.fd >= 0) { + close(ch->pfd.fd); + } +} + +static void * +bdev_rbd_handle(void *arg) +{ + struct bdev_rbd_io_channel *ch = arg; + void *ret = arg; + + if (rbd_open(ch->io_ctx, ch->disk->rbd_name, &ch->image, NULL) < 0) { + SPDK_ERRLOG("Failed to open specified rbd device\n"); + ret = NULL; + } + + return ret; +} + +static int +bdev_rbd_create_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_io_channel *ch = ctx_buf; + int ret; + + ch->disk = io_device; + ch->image = NULL; + ch->io_ctx = NULL; + ch->pfd.fd = -1; + + ret = bdev_rados_context_init(ch->disk->pool_name, &ch->cluster, &ch->io_ctx); + if (ret < 0) { + SPDK_ERRLOG("Failed to create rados context for rbd_pool=%s\n", + ch->disk->pool_name); + goto err; + } + + if (spdk_call_unaffinitized(bdev_rbd_handle, ch) == NULL) { + goto err; + } + + ch->pfd.fd = eventfd(0, EFD_NONBLOCK); + if (ch->pfd.fd < 0) { + SPDK_ERRLOG("Failed to get eventfd\n"); + goto err; + } + + ch->pfd.events = POLLIN; + ret = rbd_set_image_notification(ch->image, ch->pfd.fd, EVENT_TYPE_EVENTFD); + if (ret < 0) { + SPDK_ERRLOG("Failed to set rbd image notification\n"); + goto err; + } + + ch->poller = spdk_poller_register(bdev_rbd_io_poll, ch, BDEV_RBD_POLL_US); + + return 0; + +err: + bdev_rbd_free_channel(ch); + return -1; +} + +static void +bdev_rbd_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_rbd_io_channel *io_channel = ctx_buf; + + bdev_rbd_free_channel(io_channel); + + spdk_poller_unregister(&io_channel->poller); +} + +static struct spdk_io_channel * +bdev_rbd_get_io_channel(void *ctx) +{ + struct bdev_rbd *rbd_bdev = ctx; + + return spdk_get_io_channel(rbd_bdev); +} + +static int +bdev_rbd_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct bdev_rbd *rbd_bdev = ctx; + + spdk_json_write_name(w, "rbd"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "pool_name"); + spdk_json_write_string(w, rbd_bdev->pool_name); + + spdk_json_write_name(w, "rbd_name"); + spdk_json_write_string(w, rbd_bdev->rbd_name); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +bdev_rbd_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct bdev_rbd *rbd = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_rbd_bdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bdev->name); + spdk_json_write_named_string(w, "pool_name", rbd->pool_name); + spdk_json_write_named_string(w, "rbd_name", rbd->rbd_name); + spdk_json_write_named_uint32(w, "block_size", bdev->blocklen); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table rbd_fn_table = { + .destruct = bdev_rbd_destruct, + .submit_request = bdev_rbd_submit_request, + .io_type_supported = bdev_rbd_io_type_supported, + .get_io_channel = bdev_rbd_get_io_channel, + .dump_info_json = bdev_rbd_dump_info_json, + .write_config_json = bdev_rbd_write_config_json, +}; + +struct spdk_bdev * +spdk_bdev_rbd_create(const char *name, const char *pool_name, const char *rbd_name, + uint32_t block_size) +{ + struct bdev_rbd *rbd; + int 
ret; + + if ((pool_name == NULL) || (rbd_name == NULL)) { + return NULL; + } + + rbd = calloc(1, sizeof(struct bdev_rbd)); + if (rbd == NULL) { + SPDK_ERRLOG("Failed to allocate bdev_rbd struct\n"); + return NULL; + } + + rbd->rbd_name = strdup(rbd_name); + if (!rbd->rbd_name) { + bdev_rbd_free(rbd); + return NULL; + } + + rbd->pool_name = strdup(pool_name); + if (!rbd->pool_name) { + bdev_rbd_free(rbd); + return NULL; + } + + ret = bdev_rbd_init(rbd->pool_name, rbd_name, &rbd->info); + if (ret < 0) { + bdev_rbd_free(rbd); + SPDK_ERRLOG("Failed to init rbd device\n"); + return NULL; + } + + if (name) { + rbd->disk.name = strdup(name); + } else { + rbd->disk.name = spdk_sprintf_alloc("Ceph%d", bdev_rbd_count); + } + if (!rbd->disk.name) { + bdev_rbd_free(rbd); + return NULL; + } + rbd->disk.product_name = "Ceph Rbd Disk"; + bdev_rbd_count++; + + rbd->disk.write_cache = 0; + rbd->disk.blocklen = block_size; + rbd->disk.blockcnt = rbd->info.size / rbd->disk.blocklen; + rbd->disk.ctxt = rbd; + rbd->disk.fn_table = &rbd_fn_table; + rbd->disk.module = &rbd_if; + + SPDK_NOTICELOG("Add %s rbd disk to lun\n", rbd->disk.name); + + spdk_io_device_register(rbd, bdev_rbd_create_cb, + bdev_rbd_destroy_cb, + sizeof(struct bdev_rbd_io_channel), + rbd_name); + ret = spdk_bdev_register(&rbd->disk); + if (ret) { + spdk_io_device_unregister(rbd, NULL); + bdev_rbd_free(rbd); + return NULL; + } + + return &rbd->disk; +} + +void +spdk_bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, void *cb_arg) +{ + if (!bdev || bdev->module != &rbd_if) { + cb_fn(cb_arg, -ENODEV); + return; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); +} + +static int +bdev_rbd_library_init(void) +{ + int i, rc = 0; + const char *val; + const char *pool_name; + const char *rbd_name; + uint32_t block_size; + + struct spdk_conf_section *sp = spdk_conf_find_section(NULL, "Ceph"); + + if (sp == NULL) { + /* + * Ceph section not found. Do not initialize any rbd LUNS. + */ + goto end; + } + + /* Init rbd block devices */ + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Ceph", i); + if (val == NULL) { + break; + } + + /* get the Rbd_pool name */ + pool_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 0); + if (pool_name == NULL) { + SPDK_ERRLOG("Ceph%d: rbd pool name needs to be provided\n", i); + rc = -1; + goto end; + } + + rbd_name = spdk_conf_section_get_nmval(sp, "Ceph", i, 1); + if (rbd_name == NULL) { + SPDK_ERRLOG("Ceph%d: format error\n", i); + rc = -1; + goto end; + } + + val = spdk_conf_section_get_nmval(sp, "Ceph", i, 2); + + if (val == NULL) { + block_size = 512; /* default value */ + } else { + block_size = (int)strtol(val, NULL, 10); + if (block_size & 0x1ff) { + SPDK_ERRLOG("current block_size = %d, it should be multiple of 512\n", + block_size); + rc = -1; + goto end; + } + } + + if (spdk_bdev_rbd_create(NULL, pool_name, rbd_name, block_size) == NULL) { + rc = -1; + goto end; + } + } + +end: + return rc; +} + +SPDK_LOG_REGISTER_COMPONENT("bdev_rbd", SPDK_LOG_BDEV_RBD) diff --git a/src/spdk/lib/bdev/rbd/bdev_rbd.h b/src/spdk/lib/bdev/rbd/bdev_rbd.h new file mode 100644 index 00000000..dd2448e1 --- /dev/null +++ b/src/spdk/lib/bdev/rbd/bdev_rbd.h @@ -0,0 +1,55 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_RBD_H +#define SPDK_BDEV_RBD_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" + +typedef void (*spdk_delete_rbd_complete)(void *cb_arg, int bdeverrno); + +struct spdk_bdev *spdk_bdev_rbd_create(const char *name, const char *pool_name, + const char *rbd_name, uint32_t block_size); +/** + * Delete rbd bdev. + * + * \param bdev Pointer to rbd bdev. + * \param cb_fn Function to call after deletion. + * \param cb_arg Argument to pass to cb_fn. + */ +void spdk_bdev_rbd_delete(struct spdk_bdev *bdev, spdk_delete_rbd_complete cb_fn, + void *cb_arg); + +#endif // SPDK_BDEV_RBD_H diff --git a/src/spdk/lib/bdev/rbd/bdev_rbd_rpc.c b/src/spdk/lib/bdev/rbd/bdev_rbd_rpc.c new file mode 100644 index 00000000..745a90ed --- /dev/null +++ b/src/spdk/lib/bdev/rbd/bdev_rbd_rpc.c @@ -0,0 +1,157 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "bdev_rbd.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +struct rpc_construct_rbd { + char *name; + char *pool_name; + char *rbd_name; + uint32_t block_size; +}; + +static void +free_rpc_construct_rbd(struct rpc_construct_rbd *req) +{ + free(req->name); + free(req->pool_name); + free(req->rbd_name); +} + +static const struct spdk_json_object_decoder rpc_construct_rbd_decoders[] = { + {"name", offsetof(struct rpc_construct_rbd, name), spdk_json_decode_string, true}, + {"pool_name", offsetof(struct rpc_construct_rbd, pool_name), spdk_json_decode_string}, + {"rbd_name", offsetof(struct rpc_construct_rbd, rbd_name), spdk_json_decode_string}, + {"block_size", offsetof(struct rpc_construct_rbd, block_size), spdk_json_decode_uint32}, +}; + +static void +spdk_rpc_construct_rbd_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_rbd req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_construct_rbd_decoders, + SPDK_COUNTOF(rpc_construct_rbd_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_BDEV_RBD, "spdk_json_decode_object failed\n"); + goto invalid; + } + + bdev = spdk_bdev_rbd_create(req.name, req.pool_name, req.rbd_name, req.block_size); + if (bdev == NULL) { + goto invalid; + } + + free_rpc_construct_rbd(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_construct_rbd(&req); +} +SPDK_RPC_REGISTER("construct_rbd_bdev", spdk_rpc_construct_rbd_bdev, SPDK_RPC_RUNTIME) + +struct rpc_delete_rbd { + char *name; +}; + +static void +free_rpc_delete_rbd(struct rpc_delete_rbd *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_delete_rbd_decoders[] = { + {"name", offsetof(struct rpc_delete_rbd, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_rbd_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_rbd_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_rbd req = {NULL}; + struct spdk_bdev *bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_rbd_decoders, + SPDK_COUNTOF(rpc_delete_rbd_decoders), + &req)) { + rc = -EINVAL; + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + spdk_bdev_rbd_delete(bdev, _spdk_rpc_delete_rbd_bdev_cb, request); + free_rpc_delete_rbd(&req); + 
return; + +invalid: + free_rpc_delete_rbd(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("delete_rbd_bdev", spdk_rpc_delete_rbd_bdev, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/rpc/Makefile b/src/spdk/lib/bdev/rpc/Makefile new file mode 100644 index 00000000..4c1fcc0c --- /dev/null +++ b/src/spdk/lib/bdev/rpc/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_rpc.c +LIBNAME = bdev_rpc + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/rpc/bdev_rpc.c b/src/spdk/lib/bdev/rpc/bdev_rpc.c new file mode 100644 index 00000000..1989f6d2 --- /dev/null +++ b/src/spdk/lib/bdev/rpc/bdev_rpc.c @@ -0,0 +1,587 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" + +struct rpc_get_bdevs_iostat_ctx { + int bdev_count; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; +}; + +static void +spdk_rpc_get_bdevs_iostat_cb(struct spdk_bdev *bdev, + struct spdk_bdev_io_stat *stat, void *cb_arg, int rc) +{ + struct rpc_get_bdevs_iostat_ctx *ctx = cb_arg; + struct spdk_json_write_ctx *w = ctx->w; + const char *bdev_name; + + if (rc != 0) { + goto done; + } + + bdev_name = spdk_bdev_get_name(bdev); + if (bdev_name != NULL) { + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, bdev_name); + + spdk_json_write_name(w, "bytes_read"); + spdk_json_write_uint64(w, stat->bytes_read); + + spdk_json_write_name(w, "num_read_ops"); + spdk_json_write_uint64(w, stat->num_read_ops); + + spdk_json_write_name(w, "bytes_written"); + spdk_json_write_uint64(w, stat->bytes_written); + + spdk_json_write_name(w, "num_write_ops"); + spdk_json_write_uint64(w, stat->num_write_ops); + + spdk_json_write_name(w, "read_latency_ticks"); + spdk_json_write_uint64(w, stat->read_latency_ticks); + + spdk_json_write_name(w, "write_latency_ticks"); + spdk_json_write_uint64(w, stat->write_latency_ticks); + + if (spdk_bdev_get_qd_sampling_period(bdev)) { + spdk_json_write_name(w, "queue_depth_polling_period"); + spdk_json_write_uint64(w, spdk_bdev_get_qd_sampling_period(bdev)); + + spdk_json_write_name(w, "queue_depth"); + spdk_json_write_uint64(w, spdk_bdev_get_qd(bdev)); + + spdk_json_write_name(w, "io_time"); + spdk_json_write_uint64(w, spdk_bdev_get_io_time(bdev)); + + spdk_json_write_name(w, "weighted_io_time"); + spdk_json_write_uint64(w, spdk_bdev_get_weighted_io_time(bdev)); + } + + spdk_json_write_object_end(w); + } + +done: + free(stat); + if (--ctx->bdev_count == 0) { + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free(ctx); + } +} + +struct rpc_get_bdevs_iostat { + char *name; +}; + +static void +free_rpc_get_bdevs_iostat(struct rpc_get_bdevs_iostat *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_get_bdevs_iostat_decoders[] = { + {"name", offsetof(struct rpc_get_bdevs_iostat, name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_bdevs_iostat(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_bdevs_iostat req = {}; + struct spdk_bdev *bdev = NULL; + struct spdk_json_write_ctx *w; + struct spdk_bdev_io_stat *stat; + struct rpc_get_bdevs_iostat_ctx *ctx; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_bdevs_iostat_decoders, + SPDK_COUNTOF(rpc_get_bdevs_iostat_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name) { + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", 
req.name); + goto invalid; + } + } + } + + free_rpc_get_bdevs_iostat(&req); + + ctx = calloc(1, sizeof(struct rpc_get_bdevs_iostat_ctx)); + if (ctx == NULL) { + SPDK_ERRLOG("Failed to allocate rpc_get_bdevs_iostat_ctx struct\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "No memory left"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free(ctx); + return; + } + + /* + * Increment initial bdev_count so that it will never reach 0 in the middle + * of iterating. + */ + ctx->bdev_count++; + ctx->request = request; + ctx->w = w; + + spdk_json_write_array_begin(w); + + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "tick_rate"); + spdk_json_write_uint64(w, spdk_get_ticks_hz()); + spdk_json_write_object_end(w); + + if (bdev != NULL) { + stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); + if (stat == NULL) { + SPDK_ERRLOG("Failed to allocate rpc_get_bdevs_iostat_ctx struct\n"); + } else { + ctx->bdev_count++; + spdk_bdev_get_device_stat(bdev, stat, spdk_rpc_get_bdevs_iostat_cb, ctx); + } + } else { + for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) { + stat = calloc(1, sizeof(struct spdk_bdev_io_stat)); + if (stat == NULL) { + SPDK_ERRLOG("Failed to allocate spdk_bdev_io_stat struct\n"); + break; + } + ctx->bdev_count++; + spdk_bdev_get_device_stat(bdev, stat, spdk_rpc_get_bdevs_iostat_cb, ctx); + } + } + + if (--ctx->bdev_count == 0) { + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + free(ctx); + } + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + free_rpc_get_bdevs_iostat(&req); +} +SPDK_RPC_REGISTER("get_bdevs_iostat", spdk_rpc_get_bdevs_iostat, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_dump_bdev_info(struct spdk_json_write_ctx *w, + struct spdk_bdev *bdev) +{ + struct spdk_bdev_alias *tmp; + uint64_t qos_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; + int i; + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + + spdk_json_write_name(w, "aliases"); + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(tmp, spdk_bdev_get_aliases(bdev), tailq) { + spdk_json_write_string(w, tmp->alias); + } + + spdk_json_write_array_end(w); + + spdk_json_write_name(w, "product_name"); + spdk_json_write_string(w, spdk_bdev_get_product_name(bdev)); + + spdk_json_write_name(w, "block_size"); + spdk_json_write_uint32(w, spdk_bdev_get_block_size(bdev)); + + spdk_json_write_name(w, "num_blocks"); + spdk_json_write_uint64(w, spdk_bdev_get_num_blocks(bdev)); + + if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &bdev->uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + spdk_json_write_name(w, "assigned_rate_limits"); + spdk_json_write_object_begin(w); + spdk_bdev_get_qos_rate_limits(bdev, qos_limits); + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + spdk_json_write_name(w, spdk_bdev_get_qos_rpc_type(i)); + spdk_json_write_uint64(w, qos_limits[i]); + } + spdk_json_write_object_end(w); + + spdk_json_write_name(w, "claimed"); + spdk_json_write_bool(w, (bdev->internal.claim_module != NULL)); + + spdk_json_write_name(w, "supported_io_types"); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "read"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)); + 
spdk_json_write_name(w, "write"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); + spdk_json_write_name(w, "unmap"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)); + spdk_json_write_name(w, "write_zeroes"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)); + spdk_json_write_name(w, "flush"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)); + spdk_json_write_name(w, "reset"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_RESET)); + spdk_json_write_name(w, "nvme_admin"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN)); + spdk_json_write_name(w, "nvme_io"); + spdk_json_write_bool(w, spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO)); + spdk_json_write_object_end(w); + + spdk_json_write_name(w, "driver_specific"); + spdk_json_write_object_begin(w); + spdk_bdev_dump_info_json(bdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +struct rpc_get_bdevs { + char *name; +}; + +static void +free_rpc_get_bdevs(struct rpc_get_bdevs *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_get_bdevs_decoders[] = { + {"name", offsetof(struct rpc_get_bdevs, name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_bdevs(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_bdevs req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev = NULL; + + if (params && spdk_json_decode_object(params, rpc_get_bdevs_decoders, + SPDK_COUNTOF(rpc_get_bdevs_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name) { + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + goto invalid; + } + } + + free_rpc_get_bdevs(&req); + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + if (bdev != NULL) { + spdk_rpc_dump_bdev_info(w, bdev); + } else { + for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) { + spdk_rpc_dump_bdev_info(w, bdev); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + free_rpc_get_bdevs(&req); +} +SPDK_RPC_REGISTER("get_bdevs", spdk_rpc_get_bdevs, SPDK_RPC_RUNTIME) + +struct rpc_delete_bdev { + char *name; +}; + +static void +free_rpc_delete_bdev(struct rpc_delete_bdev *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_bdev_decoders[] = { + {"name", offsetof(struct rpc_delete_bdev, name), spdk_json_decode_string}, +}; + +static void +_spdk_rpc_delete_bdev_cb(void *cb_arg, int bdeverrno) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, bdeverrno == 0); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_delete_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_bdev req = {}; + struct spdk_bdev *bdev; + + if (spdk_json_decode_object(params, rpc_delete_bdev_decoders, + SPDK_COUNTOF(rpc_delete_bdev_decoders), + &req)) { + 
SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		goto invalid;
+	}
+
+	if (req.name == NULL) {
+		SPDK_ERRLOG("missing name param\n");
+		goto invalid;
+	}
+
+	bdev = spdk_bdev_get_by_name(req.name);
+	if (bdev == NULL) {
+		SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+		goto invalid;
+	}
+
+	spdk_bdev_unregister(bdev, _spdk_rpc_delete_bdev_cb, request);
+
+	free_rpc_delete_bdev(&req);
+
+	return;
+
+invalid:
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+	free_rpc_delete_bdev(&req);
+}
+SPDK_RPC_REGISTER("delete_bdev", spdk_rpc_delete_bdev, SPDK_RPC_RUNTIME)
+
+struct rpc_set_bdev_qd_sampling_period {
+	char *name;
+	uint64_t period;
+};
+
+static void
+free_rpc_set_bdev_qd_sampling_period(struct rpc_set_bdev_qd_sampling_period *r)
+{
+	free(r->name);
+}
+
+static const struct spdk_json_object_decoder
+	rpc_set_bdev_qd_sampling_period_decoders[] = {
+	{"name", offsetof(struct rpc_set_bdev_qd_sampling_period, name), spdk_json_decode_string},
+	{"period", offsetof(struct rpc_set_bdev_qd_sampling_period, period), spdk_json_decode_uint64},
+};
+
+static void
+spdk_rpc_set_bdev_qd_sampling_period(struct spdk_jsonrpc_request *request,
+				     const struct spdk_json_val *params)
+{
+	struct rpc_set_bdev_qd_sampling_period req = {0};
+	struct spdk_bdev *bdev;
+	struct spdk_json_write_ctx *w;
+
+	req.period = UINT64_MAX;
+
+	if (spdk_json_decode_object(params, rpc_set_bdev_qd_sampling_period_decoders,
+				    SPDK_COUNTOF(rpc_set_bdev_qd_sampling_period_decoders),
+				    &req)) {
+		SPDK_ERRLOG("spdk_json_decode_object failed\n");
+		goto invalid;
+	}
+
+	if (req.name) {
+		bdev = spdk_bdev_get_by_name(req.name);
+		if (bdev == NULL) {
+			SPDK_ERRLOG("bdev '%s' does not exist\n", req.name);
+			goto invalid;
+		}
+	} else {
+		SPDK_ERRLOG("Missing name param\n");
+		goto invalid;
+	}
+
+	if (req.period == UINT64_MAX) {
+		SPDK_ERRLOG("Missing period param\n");
+		goto invalid;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	if (w == NULL) {
+		free_rpc_set_bdev_qd_sampling_period(&req);
+		return;
+	}
+
+	spdk_bdev_set_qd_sampling_period(bdev, req.period);
+
+	spdk_json_write_bool(w, true);
+	spdk_jsonrpc_end_result(request, w);
+	free_rpc_set_bdev_qd_sampling_period(&req);
+	return;
+
+invalid:
+	spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters");
+	free_rpc_set_bdev_qd_sampling_period(&req);
+	return;
+}
+SPDK_RPC_REGISTER("set_bdev_qd_sampling_period",
+		  spdk_rpc_set_bdev_qd_sampling_period,
+		  SPDK_RPC_RUNTIME)
+
+struct rpc_set_bdev_qos_limit {
+	char *name;
+	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
+};
+
+static void
+free_rpc_set_bdev_qos_limit(struct rpc_set_bdev_qos_limit *r)
+{
+	free(r->name);
+}
+
+static const struct spdk_json_object_decoder rpc_set_bdev_qos_limit_decoders[] = {
+	{"name", offsetof(struct rpc_set_bdev_qos_limit, name), spdk_json_decode_string},
+	{
+		"rw_ios_per_sec", offsetof(struct rpc_set_bdev_qos_limit,
+					   limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT]),
+		spdk_json_decode_uint64, true
+	},
+	{
+		"rw_mbytes_per_sec", offsetof(struct rpc_set_bdev_qos_limit,
+					      limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT]),
+		spdk_json_decode_uint64, true
+	},
+};
+
+static void
+spdk_rpc_set_bdev_qos_limit_complete(void *cb_arg, int status)
+{
+	struct spdk_jsonrpc_request *request = cb_arg;
+	struct spdk_json_write_ctx *w;
+
+	if (status != 0) {
+		spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
+						     "Failed to configure rate limit: %s",
+						     spdk_strerror(-status));
+		return;
+	}
+
+	w = spdk_jsonrpc_begin_result(request);
+	if (w == NULL) {
+		return;
+	}
+
+	
spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_set_bdev_qos_limit(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_set_bdev_qos_limit req = {NULL, {UINT64_MAX, UINT64_MAX}}; + struct spdk_bdev *bdev; + bool valid_limit = false; + int i; + + if (spdk_json_decode_object(params, rpc_set_bdev_qos_limit_decoders, + SPDK_COUNTOF(rpc_set_bdev_qos_limit_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + bdev = spdk_bdev_get_by_name(req.name); + if (bdev == NULL) { + SPDK_ERRLOG("bdev '%s' does not exist\n", req.name); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Bdev does not exist"); + goto exit; + } + + for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { + if (req.limits[i] != UINT64_MAX) { + valid_limit = true; + } + } + + if (valid_limit == false) { + SPDK_ERRLOG("no rate limits specified\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "No rate limits specified"); + goto exit; + } + + free_rpc_set_bdev_qos_limit(&req); + spdk_bdev_set_qos_rate_limits(bdev, req.limits, spdk_rpc_set_bdev_qos_limit_complete, request); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +exit: + free_rpc_set_bdev_qos_limit(&req); +} + +SPDK_RPC_REGISTER("set_bdev_qos_limit", spdk_rpc_set_bdev_qos_limit, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/scsi_nvme.c b/src/spdk/lib/bdev/scsi_nvme.c new file mode 100644 index 00000000..385b9036 --- /dev/null +++ b/src/spdk/lib/bdev/scsi_nvme.c @@ -0,0 +1,261 @@ +/*- + * BSD LICENSE + * + * Copyright (c) 2016 FUJITSU LIMITED, All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev_module.h" + +#include "spdk/nvme_spec.h" + +void +spdk_scsi_nvme_translate(const struct spdk_bdev_io *bdev_io, int *sc, int *sk, + int *asc, int *ascq) +{ + int nvme_sct = bdev_io->internal.error.nvme.sct; + int nvme_sc = bdev_io->internal.error.nvme.sc; + + switch (nvme_sct) { + case SPDK_NVME_SCT_GENERIC: + switch (nvme_sc) { + case SPDK_NVME_SC_SUCCESS: + *sc = SPDK_SCSI_STATUS_GOOD; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_OPCODE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FIELD: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_WARNING; + *ascq = SPDK_SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_HARDWARE_ERROR; + *asc = SPDK_SCSI_ASC_INTERNAL_TARGET_FAILURE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + *sc = SPDK_SCSI_STATUS_TASK_ABORTED; + *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_INVALID_LU_IDENTIFIER; + break; + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_NOT_READY; + *asc = SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_RESERVATION_CONFLICT: + *sc = SPDK_SCSI_STATUS_RESERVATION_CONFLICT; + *sk = SPDK_SCSI_SENSE_NO_SENSE; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR: + case SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS: + case SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID: + case SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID: + case SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF: + case SPDK_NVME_SC_INVALID_PRP_OFFSET: + case SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_SGL_OFFSET: + case SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT: + case SPDK_NVME_SC_KEEP_ALIVE_EXPIRED: + case 
SPDK_NVME_SC_KEEP_ALIVE_INVALID: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + switch (nvme_sc) { + case SPDK_NVME_SC_COMPLETION_QUEUE_INVALID: + case SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_FORMAT: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_FORMAT_COMMAND_FAILED; + *ascq = SPDK_SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case SPDK_NVME_SC_CONFLICTING_ATTRIBUTES: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_WRITE_PROTECTED; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER: + case SPDK_NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED: + case SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED: + case SPDK_NVME_SC_INVALID_FIRMWARE_SLOT: + case SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE: + case SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR: + case SPDK_NVME_SC_INVALID_LOG_PAGE: + case SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET: + case SPDK_NVME_SC_INVALID_QUEUE_DELETION: + case SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE: + case SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE: + case SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC: + case SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_RESET: + case SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION: + case SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED: + case SPDK_NVME_SC_OVERLAPPING_RANGE: + case SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY: + case SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE: + case SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED: + case SPDK_NVME_SC_NAMESPACE_IS_PRIVATE: + case SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED: + case SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED: + case SPDK_NVME_SC_CONTROLLER_LIST_INVALID: + case SPDK_NVME_SC_INVALID_PROTECTION_INFO: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + switch (nvme_sc) { + case SPDK_NVME_SC_WRITE_FAULTS: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_PERIPHERAL_DEVICE_WRITE_FAULT; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_UNRECOVERED_READ_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_UNRECOVERED_READ_ERROR; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_GUARD_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_GUARD_CHECK_FAILED; + break; + case SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = 
SPDK_SCSI_ASC_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MEDIUM_ERROR; + *asc = SPDK_SCSI_ASC_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + *ascq = SPDK_SCSI_ASCQ_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED; + break; + case SPDK_NVME_SC_COMPARE_FAILURE: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_MISCOMPARE; + *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case SPDK_NVME_SC_ACCESS_DENIED: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_DATA_PROTECT; + *asc = SPDK_SCSI_ASC_ACCESS_DENIED; + *ascq = SPDK_SCSI_ASCQ_NO_ACCESS_RIGHTS; + break; + case SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; + *sk = SPDK_SCSI_SENSE_ILLEGAL_REQUEST; + *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; + *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } +} diff --git a/src/spdk/lib/bdev/split/Makefile b/src/spdk/lib/bdev/split/Makefile new file mode 100644 index 00000000..46edf89a --- /dev/null +++ b/src/spdk/lib/bdev/split/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
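+# As with the other bdev module Makefiles in this tree, the module only needs
+# to list its sources in C_SRCS and name its output archive in LIBNAME; the
+# shared SPDK library rules included below handle the rest of the build.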
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = vbdev_split.c vbdev_split_rpc.c +LIBNAME = vbdev_split + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/split/vbdev_split.c b/src/spdk/lib/bdev/split/vbdev_split.c new file mode 100644 index 00000000..97f11984 --- /dev/null +++ b/src/spdk/lib/bdev/split/vbdev_split.c @@ -0,0 +1,565 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This is a simple example of a virtual block device that takes a single + * bdev and slices it into multiple smaller bdevs. 
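+ *
+ * vbdev_split_init() below also accepts an optional legacy configuration
+ * section of the following form (the bdev names here are illustrative only):
+ *
+ *   [Split]
+ *   # Split <base bdev name> <split count> [<split size in MB>]
+ *   Split Malloc0 4
+ *   Split Nvme0n1 8 128
+ *
+ * When no split size is given, the base bdev is divided evenly into
+ * <split count> parts.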
+ */ + +#include "vbdev_split.h" + +#include "spdk/rpc.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/util.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +struct spdk_vbdev_split_config { + char *base_bdev; + unsigned split_count; + uint64_t split_size_mb; + + struct spdk_bdev_part_base *split_base; + bool removed; + + TAILQ_ENTRY(spdk_vbdev_split_config) tailq; +}; + +static TAILQ_HEAD(, spdk_vbdev_split_config) g_split_config = TAILQ_HEAD_INITIALIZER( + g_split_config); +static SPDK_BDEV_PART_TAILQ g_split_disks = TAILQ_HEAD_INITIALIZER(g_split_disks); + +struct vbdev_split_channel { + struct spdk_bdev_part_channel part_ch; +}; + +struct vbdev_split_bdev_io { + struct spdk_io_channel *ch; + struct spdk_bdev_io *bdev_io; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; +}; + +static void vbdev_split_del_config(struct spdk_vbdev_split_config *cfg); + +static int vbdev_split_init(void); +static void vbdev_split_fini(void); +static void vbdev_split_examine(struct spdk_bdev *bdev); +static int vbdev_split_config_json(struct spdk_json_write_ctx *w); +static int vbdev_split_get_ctx_size(void); + +static void +vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io); + +static struct spdk_bdev_module split_if = { + .name = "split", + .module_init = vbdev_split_init, + .module_fini = vbdev_split_fini, + .get_ctx_size = vbdev_split_get_ctx_size, + .examine_config = vbdev_split_examine, + .config_json = vbdev_split_config_json, +}; + +SPDK_BDEV_MODULE_REGISTER(&split_if) + +static void +vbdev_split_base_free(void *ctx) +{ + struct spdk_vbdev_split_config *cfg = ctx; + + cfg->split_base = NULL; + if (cfg->removed) { + vbdev_split_del_config(cfg); + } +} + +static int +vbdev_split_destruct(void *ctx) +{ + struct spdk_bdev_part *part = ctx; + + return spdk_bdev_part_free(part); +} + +static void +vbdev_split_base_bdev_hotremove_cb(void *_base_bdev) +{ + spdk_bdev_part_base_hotremove(_base_bdev, &g_split_disks); +} + +static void +vbdev_split_resubmit_io(void *arg) +{ + struct vbdev_split_bdev_io *split_io = (struct vbdev_split_bdev_io *)arg; + + vbdev_split_submit_request(split_io->ch, split_io->bdev_io); +} + +static void +vbdev_split_queue_io(struct vbdev_split_bdev_io *split_io) +{ + int rc; + + split_io->bdev_io_wait.bdev = split_io->bdev_io->bdev; + split_io->bdev_io_wait.cb_fn = vbdev_split_resubmit_io; + split_io->bdev_io_wait.cb_arg = split_io; + + rc = spdk_bdev_queue_io_wait(split_io->bdev_io->bdev, + split_io->ch, &split_io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vbdev_split_queue_io, rc=%d\n", rc); + spdk_bdev_io_complete(split_io->bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +vbdev_split_submit_request(struct spdk_io_channel *_ch, struct spdk_bdev_io *bdev_io) +{ + struct vbdev_split_channel *ch = spdk_io_channel_get_ctx(_ch); + struct vbdev_split_bdev_io *io_ctx = (struct vbdev_split_bdev_io *)bdev_io->driver_ctx; + int rc; + + rc = spdk_bdev_part_submit_request(&ch->part_ch, bdev_io); + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "split: no memory, queue io.\n"); + io_ctx->ch = _ch; + io_ctx->bdev_io = bdev_io; + vbdev_split_queue_io(io_ctx); + } else { + SPDK_ERRLOG("split: error on io submission, rc=%d.\n", rc); + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } + } +} + +static int +vbdev_split_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ 
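+	/* fn_table callback: fills the "driver_specific" object that the
+	 * generic get_bdevs RPC reports for each split bdev.
+	 */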
+ struct spdk_bdev_part *part = ctx; + struct spdk_bdev *split_base_bdev = spdk_bdev_part_get_base_bdev(part); + uint64_t offset_blocks = spdk_bdev_part_get_offset_blocks(part); + + spdk_json_write_name(w, "split"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "base_bdev"); + spdk_json_write_string(w, spdk_bdev_get_name(split_base_bdev)); + spdk_json_write_name(w, "offset_blocks"); + spdk_json_write_uint64(w, offset_blocks); + + spdk_json_write_object_end(w); + + return 0; +} + +static void +vbdev_split_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* No config per bdev needed */ +} + +static struct spdk_bdev_fn_table vbdev_split_fn_table = { + .destruct = vbdev_split_destruct, + .submit_request = vbdev_split_submit_request, + .dump_info_json = vbdev_split_dump_info_json, + .write_config_json = vbdev_split_write_config_json +}; + +static int +vbdev_split_create(struct spdk_vbdev_split_config *cfg) +{ + uint64_t split_size_blocks, offset_blocks; + uint64_t split_count, max_split_count; + uint64_t mb = 1024 * 1024; + uint64_t i; + int rc; + char *name; + struct spdk_bdev *base_bdev; + struct spdk_bdev *split_base_bdev; + struct bdev_part_tailq *split_base_tailq; + + assert(cfg->split_count > 0); + + base_bdev = spdk_bdev_get_by_name(cfg->base_bdev); + if (!base_bdev) { + return -ENODEV; + } + + if (cfg->split_size_mb) { + if (((cfg->split_size_mb * mb) % base_bdev->blocklen) != 0) { + SPDK_ERRLOG("Split size %" PRIu64 " MB is not possible with block size " + "%" PRIu32 "\n", + cfg->split_size_mb, base_bdev->blocklen); + return -EINVAL; + } + split_size_blocks = (cfg->split_size_mb * mb) / base_bdev->blocklen; + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size %" PRIu64 " MB specified by user\n", + cfg->split_size_mb); + } else { + split_size_blocks = base_bdev->blockcnt / cfg->split_count; + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "Split size not specified by user\n"); + } + + max_split_count = base_bdev->blockcnt / split_size_blocks; + split_count = cfg->split_count; + if (split_count > max_split_count) { + SPDK_WARNLOG("Split count %" PRIu64 " is greater than maximum possible split count " + "%" PRIu64 " - clamping\n", split_count, max_split_count); + split_count = max_split_count; + } + + SPDK_DEBUGLOG(SPDK_LOG_VBDEV_SPLIT, "base_bdev: %s split_count: %" PRIu64 + " split_size_blocks: %" PRIu64 "\n", + spdk_bdev_get_name(base_bdev), split_count, split_size_blocks); + + cfg->split_base = spdk_bdev_part_base_construct(base_bdev, + vbdev_split_base_bdev_hotremove_cb, + &split_if, &vbdev_split_fn_table, + &g_split_disks, vbdev_split_base_free, cfg, + sizeof(struct vbdev_split_channel), NULL, NULL); + if (!cfg->split_base) { + SPDK_ERRLOG("Cannot construct bdev part base\n"); + return -ENOMEM; + } + + offset_blocks = 0; + for (i = 0; i < split_count; i++) { + struct spdk_bdev_part *d; + + d = calloc(1, sizeof(*d)); + if (d == NULL) { + SPDK_ERRLOG("could not allocate bdev part\n"); + rc = -ENOMEM; + goto err; + } + + name = spdk_sprintf_alloc("%sp%" PRIu64, cfg->base_bdev, i); + if (!name) { + SPDK_ERRLOG("could not allocate name\n"); + free(d); + rc = -ENOMEM; + goto err; + } + + rc = spdk_bdev_part_construct(d, cfg->split_base, name, offset_blocks, split_size_blocks, + "Split Disk"); + free(name); + if (rc) { + SPDK_ERRLOG("could not construct bdev part\n"); + /* spdk_bdev_part_construct will free name if it fails */ + free(d); + rc = -ENOMEM; + goto err; + } + + offset_blocks += split_size_blocks; + } + + return 0; +err: + split_base_bdev = 
spdk_bdev_part_base_get_bdev(cfg->split_base); + split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); + cfg->removed = true; + spdk_bdev_part_base_hotremove(split_base_bdev, split_base_tailq); + return rc; +} + +static void +vbdev_split_del_config(struct spdk_vbdev_split_config *cfg) +{ + TAILQ_REMOVE(&g_split_config, cfg, tailq); + free(cfg->base_bdev); + free(cfg); +} + +static void +vbdev_split_destruct_config(struct spdk_vbdev_split_config *cfg) +{ + struct spdk_bdev *split_base_bdev; + struct bdev_part_tailq *split_base_tailq; + + cfg->removed = true; + if (cfg->split_base != NULL) { + split_base_bdev = spdk_bdev_part_base_get_bdev(cfg->split_base); + split_base_tailq = spdk_bdev_part_base_get_tailq(cfg->split_base); + spdk_bdev_part_base_hotremove(split_base_bdev, split_base_tailq); + } else { + vbdev_split_del_config(cfg); + } +} + +static void +vbdev_split_clear_config(void) +{ + struct spdk_vbdev_split_config *cfg, *tmp_cfg; + + TAILQ_FOREACH_SAFE(cfg, &g_split_config, tailq, tmp_cfg) { + vbdev_split_destruct_config(cfg); + } +} + +static struct spdk_vbdev_split_config * +vbdev_split_config_find_by_base_name(const char *base_bdev_name) +{ + struct spdk_vbdev_split_config *cfg; + + TAILQ_FOREACH(cfg, &g_split_config, tailq) { + if (strcmp(cfg->base_bdev, base_bdev_name) == 0) { + return cfg; + } + } + + return NULL; +} + +static int +vbdev_split_add_config(const char *base_bdev_name, unsigned split_count, uint64_t split_size, + struct spdk_vbdev_split_config **config) +{ + struct spdk_vbdev_split_config *cfg; + assert(base_bdev_name); + + if (base_bdev_name == NULL) { + SPDK_ERRLOG("Split bdev config: no base bdev provided."); + return -EINVAL; + } + + if (split_count == 0) { + SPDK_ERRLOG("Split bdev config: split_count can't be 0."); + return -EINVAL; + } + + /* Check if we already have 'base_bdev_name' registered in config */ + cfg = vbdev_split_config_find_by_base_name(base_bdev_name); + if (cfg) { + SPDK_ERRLOG("Split bdev config for base bdev '%s' already exist.", base_bdev_name); + return -EEXIST; + } + + cfg = calloc(1, sizeof(*cfg)); + if (!cfg) { + SPDK_ERRLOG("calloc(): Out of memory"); + return -ENOMEM; + } + + cfg->base_bdev = strdup(base_bdev_name); + if (!cfg->base_bdev) { + SPDK_ERRLOG("strdup(): Out of memory"); + free(cfg); + return -ENOMEM; + } + + cfg->split_count = split_count; + cfg->split_size_mb = split_size; + TAILQ_INSERT_TAIL(&g_split_config, cfg, tailq); + if (config) { + *config = cfg; + } + + return 0; +} + +static int +vbdev_split_init(void) +{ + + struct spdk_conf_section *sp; + const char *base_bdev_name; + const char *split_count_str; + const char *split_size_str; + int rc, i, split_count, split_size; + + sp = spdk_conf_find_section(NULL, "Split"); + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + if (!spdk_conf_section_get_nval(sp, "Split", i)) { + break; + } + + base_bdev_name = spdk_conf_section_get_nmval(sp, "Split", i, 0); + if (!base_bdev_name) { + SPDK_ERRLOG("Split configuration missing bdev name\n"); + rc = -EINVAL; + goto err; + } + + split_count_str = spdk_conf_section_get_nmval(sp, "Split", i, 1); + if (!split_count_str) { + SPDK_ERRLOG("Split configuration missing split count\n"); + rc = -EINVAL; + goto err; + } + + split_count = atoi(split_count_str); + if (split_count < 1) { + SPDK_ERRLOG("Invalid Split count %d\n", split_count); + rc = -EINVAL; + goto err; + } + + /* Optional split size in MB */ + split_size = 0; + split_size_str = spdk_conf_section_get_nmval(sp, "Split", i, 2); + if (split_size_str) { + 
split_size = atoi(split_size_str); + if (split_size <= 0) { + SPDK_ERRLOG("Invalid Split size %d\n", split_size); + rc = -EINVAL; + goto err; + } + } + + rc = vbdev_split_add_config(base_bdev_name, split_count, split_size, NULL); + if (rc != 0) { + goto err; + } + } + + return 0; +err: + vbdev_split_clear_config(); + return rc; +} + +static void +vbdev_split_fini(void) +{ + vbdev_split_clear_config(); +} + +static void +vbdev_split_examine(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(bdev->name); + + if (cfg != NULL && cfg->removed == false) { + assert(cfg->split_base == NULL); + + if (vbdev_split_create(cfg)) { + SPDK_ERRLOG("could not split bdev %s\n", bdev->name); + } + } + spdk_bdev_module_examine_done(&split_if); +} + +static int +vbdev_split_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_vbdev_split_config *cfg; + + TAILQ_FOREACH(cfg, &g_split_config, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_split_vbdev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "base_bdev", cfg->base_bdev); + spdk_json_write_named_uint32(w, "split_count", cfg->split_count); + spdk_json_write_named_uint64(w, "split_size_mb", cfg->split_size_mb); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + return 0; +} + +int +create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb) +{ + int rc; + struct spdk_vbdev_split_config *cfg; + + rc = vbdev_split_add_config(base_bdev_name, split_count, split_size_mb, &cfg); + if (rc) { + return rc; + } + + rc = vbdev_split_create(cfg); + if (rc == -ENODEV) { + /* It is ok if base bdev does not exist yet. */ + rc = 0; + } + + return rc; +} + +int +spdk_vbdev_split_destruct(const char *base_bdev_name) +{ + struct spdk_vbdev_split_config *cfg = vbdev_split_config_find_by_base_name(base_bdev_name); + + if (!cfg) { + SPDK_ERRLOG("Split configuration for '%s' not found\n", base_bdev_name); + return -ENOENT; + } + + vbdev_split_destruct_config(cfg); + return 0; +} + +struct spdk_bdev_part_base * +spdk_vbdev_split_get_part_base(struct spdk_bdev *bdev) +{ + struct spdk_vbdev_split_config *cfg; + + cfg = vbdev_split_config_find_by_base_name(spdk_bdev_get_name(bdev)); + + if (cfg == NULL) { + return NULL; + } + + return cfg->split_base; +} + +/* + * During init we'll be asked how much memory we'd like passed to us + * in bev_io structures as context. Here's where we specify how + * much context we want per IO. + */ +static int +vbdev_split_get_ctx_size(void) +{ + return sizeof(struct vbdev_split_bdev_io); +} + +SPDK_LOG_REGISTER_COMPONENT("vbdev_split", SPDK_LOG_VBDEV_SPLIT) diff --git a/src/spdk/lib/bdev/split/vbdev_split.h b/src/spdk/lib/bdev/split/vbdev_split.h new file mode 100644 index 00000000..4231d443 --- /dev/null +++ b/src/spdk/lib/bdev/split/vbdev_split.h @@ -0,0 +1,68 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_VBDEV_SPLIT_H
+#define SPDK_VBDEV_SPLIT_H
+
+#include "spdk/bdev_module.h"
+
+/**
+ * Add the given disk name to the split config. If a bdev with the
+ * \c base_bdev_name name exists, the split bdevs will be created right away;
+ * if not, they will be created when the base bdev becomes available
+ * (during the examination process).
+ *
+ * \param base_bdev_name Base bdev name
+ * \param split_count number of splits to be created.
+ * \param split_size_mb size of each bdev. If 0 use base bdev size / split_count
+ * \return value >= 0 - number of splits created. Negative errno code on error.
+ */
+int create_vbdev_split(const char *base_bdev_name, unsigned split_count, uint64_t split_size_mb);
+
+/**
+ * Remove all created split bdevs and split config.
+ *
+ * \param base_bdev_name base bdev name
+ * \return 0 on success or negative errno value.
+ */
+int spdk_vbdev_split_destruct(const char *base_bdev_name);
+
+/**
+ * Get the spdk_bdev_part_base associated with the given split base_bdev.
+ *
+ * \param base_bdev Bdev to get the part_base from
+ * \return pointer to the associated spdk_bdev_part_base
+ * \return NULL if the base_bdev is not being split by the split module
+ */
+struct spdk_bdev_part_base *spdk_vbdev_split_get_part_base(struct spdk_bdev *base_bdev);
+
+#endif // SPDK_VBDEV_SPLIT_H
diff --git a/src/spdk/lib/bdev/split/vbdev_split_rpc.c b/src/spdk/lib/bdev/split/vbdev_split_rpc.c
new file mode 100644
index 00000000..fe70346f
--- /dev/null
+++ b/src/spdk/lib/bdev/split/vbdev_split_rpc.c
@@ -0,0 +1,151 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "vbdev_split.h" +#include "spdk_internal/log.h" + +struct rpc_construct_split { + char *base_bdev; + uint32_t split_count; + uint64_t split_size_mb; +}; + +static const struct spdk_json_object_decoder rpc_construct_split_decoders[] = { + {"base_bdev", offsetof(struct rpc_construct_split, base_bdev), spdk_json_decode_string}, + {"split_count", offsetof(struct rpc_construct_split, split_count), spdk_json_decode_uint32}, + {"split_size_mb", offsetof(struct rpc_construct_split, split_size_mb), spdk_json_decode_uint64, true}, +}; + +static void +spdk_rpc_construct_split_vbdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_split req = {}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *base_bdev; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_split_decoders, + SPDK_COUNTOF(rpc_construct_split_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = create_vbdev_split(req.base_bdev, req.split_count, req.split_size_mb); + if (rc < 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Failed to create %"PRIu32" split bdevs from '%s': %s", + req.split_count, req.base_bdev, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + goto out; + } + + spdk_json_write_array_begin(w); + + base_bdev = spdk_bdev_get_by_name(req.base_bdev); + if (base_bdev != NULL) { + struct spdk_bdev_part_base *split_base; + struct bdev_part_tailq *split_base_tailq; + struct spdk_bdev_part *split_part; + struct spdk_bdev *split_bdev; + + split_base = spdk_vbdev_split_get_part_base(base_bdev); + + assert(split_base != NULL); + + split_base_tailq = spdk_bdev_part_base_get_tailq(split_base); + TAILQ_FOREACH(split_part, split_base_tailq, tailq) { + split_bdev = spdk_bdev_part_get_bdev(split_part); + spdk_json_write_string(w, spdk_bdev_get_name(split_bdev)); + } + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); + +out: + free(req.base_bdev); +} +SPDK_RPC_REGISTER("construct_split_vbdev", spdk_rpc_construct_split_vbdev, SPDK_RPC_RUNTIME) + +struct rpc_destruct_split { + char *base_bdev; +}; + +static const struct spdk_json_object_decoder rpc_destruct_split_decoders[] = { + {"base_bdev", offsetof(struct rpc_destruct_split, base_bdev), 
spdk_json_decode_string}, +}; + +static void +spdk_rpc_destruct_split(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_destruct_split req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_destruct_split_decoders, + SPDK_COUNTOF(rpc_destruct_split_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + goto out; + } + + rc = spdk_vbdev_split_destruct(req.base_bdev); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + goto out; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + goto out; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +out: + free(req.base_bdev); +} +SPDK_RPC_REGISTER("destruct_split_vbdev", spdk_rpc_destruct_split, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/bdev/virtio/Makefile b/src/spdk/lib/bdev/virtio/Makefile new file mode 100644 index 00000000..fabe2b9f --- /dev/null +++ b/src/spdk/lib/bdev/virtio/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev_virtio_scsi.c bdev_virtio_blk.c bdev_virtio_rpc.c +LIBNAME = bdev_virtio + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/bdev/virtio/bdev_virtio.h b/src/spdk/lib/bdev/virtio/bdev_virtio.h new file mode 100644 index 00000000..538fab8f --- /dev/null +++ b/src/spdk/lib/bdev/virtio/bdev_virtio.h @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BDEV_VIRTIO_H +#define SPDK_BDEV_VIRTIO_H + +#include "spdk/bdev.h" +#include "spdk/env.h" + +/** + * Callback for creating virtio bdevs. + * + * \param ctx opaque context set by the user + * \param errnum error code. 0 on success, negative errno on error. + * \param bdevs contiguous array of created bdevs + * \param bdev_cnt number of bdevs in the `bdevs` array + */ +typedef void (*bdev_virtio_create_cb)(void *ctx, int errnum, + struct spdk_bdev **bdevs, size_t bdev_cnt); + +/** + * Callback for removing virtio devices. + * + * \param ctx opaque context set by the user + * \param errnum error code. 0 on success, negative errno on error. + */ +typedef void (*bdev_virtio_remove_cb)(void *ctx, int errnum); + +/** + * Connect to a vhost-user Unix domain socket and create a Virtio SCSI device. + * If the connection is successful, the device will be automatically scanned. + * The scan consists of probing the targets on the device and will result in + * creating possibly multiple Virtio SCSI bdevs - one for each target. Currently + * only one LUN per target is detected - LUN0. Note that the bdev creation is + * run asynchronously in the background. After it's finished, the `cb_fn` + * callback is called. + * + * \param name name for the virtio device. It will be inherited by all created + * bdevs, which are named in the following format: t + * \param path path to the socket + * \param num_queues max number of request virtqueues to use. `vdev` will be + * started successfully even if the host device supports less queues than requested. + * \param queue_size depth of each queue + * \param cb_fn function to be called after scanning all targets on the virtio + * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. + * \param cb_arg argument for the `cb_fn` + * \return zero on success (device scan is started) or negative error code. + * In case of error the \c cb_fn is not called. 
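+ *
+ * A minimal calling sketch (the device name, socket path, and queue sizing
+ * below are illustrative only, not defaults):
+ *
+ *   static void scan_done(void *ctx, int errnum,
+ *                         struct spdk_bdev **bdevs, size_t bdev_cnt)
+ *   {
+ *           SPDK_NOTICELOG("virtio scan done: rc=%d, %zu bdevs\n",
+ *                          errnum, bdev_cnt);
+ *   }
+ *
+ *   bdev_virtio_user_scsi_dev_create("VirtioScsi0", "/var/tmp/vhost.0",
+ *                                    4, 128, scan_done, NULL);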
+ */
+int bdev_virtio_user_scsi_dev_create(const char *name, const char *path,
+				     unsigned num_queues, unsigned queue_size,
+				     bdev_virtio_create_cb cb_fn, void *cb_arg);
+
+/**
+ * Attach virtio-pci device. This creates a Virtio SCSI device with the same
+ * capabilities as the vhost-user equivalent. The device will be automatically
+ * scanned for exposed SCSI targets. This will result in creating possibly multiple
+ * Virtio SCSI bdevs - one for each target. Currently only one LUN per target is
+ * detected - LUN0. Note that the bdev creation is run asynchronously in the
+ * background. After it's finished, the `cb_fn` callback is called.
+ *
+ * \param name name for the virtio device. It will be inherited by all created
+ * bdevs, which are named in the following format: <name>t<target_id>
+ * \param pci_addr PCI address of the device to attach
+ * \param cb_fn function to be called after scanning all targets on the virtio
+ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb.
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success (device scan is started) or negative error code.
+ * In case of error the \c cb_fn is not called.
+ */
+int bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr,
+				    bdev_virtio_create_cb cb_fn, void *cb_arg);
+
+/**
+ * Remove a Virtio device with given name. This will destroy all bdevs exposed
+ * by this device.
+ *
+ * \param name virtio device name
+ * \param cb_fn function to be called after scanning all targets on the virtio
+ * device. It's optional, can be NULL. See \c bdev_virtio_create_cb. Possible
+ * error codes are:
+ * * ENODEV - couldn't find device with given name
+ * * EBUSY - device is already being removed
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success or -ENODEV if scsi dev does not exist
+ */
+int bdev_virtio_scsi_dev_remove(const char *name,
+				bdev_virtio_remove_cb cb_fn, void *cb_arg);
+
+/**
+ * Remove a Virtio device with given name.
+ *
+ * \param name virtio blk device bdev name
+ * \param cb_fn function to be called after removing bdev
+ * \param cb_arg argument for the `cb_fn`
+ * \return zero on success, -ENODEV if bdev with 'name' does not exist or
+ * -EINVAL if bdev with 'name' is not a virtio blk device.
+ */
+int bdev_virtio_blk_dev_remove(const char *name,
+			       bdev_virtio_remove_cb cb_fn, void *cb_arg);
+
+/**
+ * List all created Virtio-SCSI devices.
+ *
+ * \param write_ctx JSON context to write into
+ */
+void bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *write_ctx);
+
+/**
+ * Connect to a vhost-user Unix domain socket and create a Virtio BLK bdev.
+ *
+ * \param name name for the virtio bdev
+ * \param path path to the socket
+ * \param num_queues max number of request virtqueues to use. `vdev` will be
+ * started successfully even if the host device supports fewer queues than requested.
+ * \param queue_size depth of each queue
+ * \return virtio-blk bdev or NULL
+ */
+struct spdk_bdev *bdev_virtio_user_blk_dev_create(const char *name, const char *path,
+		unsigned num_queues, unsigned queue_size);
+
+/**
+ * Attach virtio-pci device. This creates a Virtio BLK device with the same
+ * capabilities as the vhost-user equivalent.
+ *
+ * \param name name for the virtio device.
It will be inherited by all created + * bdevs, which are named in the following format: t + * \param pci_addr PCI address of the device to attach + * \return virtio-blk bdev or NULL + */ +struct spdk_bdev *bdev_virtio_pci_blk_dev_create(const char *name, + struct spdk_pci_addr *pci_addr); + +#endif /* SPDK_BDEV_VIRTIO_H */ diff --git a/src/spdk/lib/bdev/virtio/bdev_virtio_blk.c b/src/spdk/lib/bdev/virtio/bdev_virtio_blk.c new file mode 100644 index 00000000..598f7f15 --- /dev/null +++ b/src/spdk/lib/bdev/virtio/bdev_virtio_blk.c @@ -0,0 +1,707 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" + +#include "spdk_internal/assert.h" +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" + +#include + +#include "bdev_virtio.h" + +struct virtio_blk_dev { + struct virtio_dev vdev; + struct spdk_bdev bdev; + bool readonly; +}; + +struct virtio_blk_io_ctx { + struct iovec iov_req; + struct iovec iov_resp; + struct virtio_blk_outhdr req; + uint8_t resp; +}; + +struct bdev_virtio_blk_io_channel { + struct virtio_dev *vdev; + + /** Virtqueue exclusively assigned to this channel. */ + struct virtqueue *vq; + + /** Virtio response poller. */ + struct spdk_poller *poller; +}; + +/* Features desired/implemented by this driver. 
*/ +#define VIRTIO_BLK_DEV_SUPPORTED_FEATURES \ + (1ULL << VIRTIO_BLK_F_BLK_SIZE | \ + 1ULL << VIRTIO_BLK_F_TOPOLOGY | \ + 1ULL << VIRTIO_BLK_F_MQ | \ + 1ULL << VIRTIO_BLK_F_RO | \ + 1ULL << VIRTIO_RING_F_EVENT_IDX | \ + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES) + +static int bdev_virtio_initialize(void); +static int bdev_virtio_blk_get_ctx_size(void); + +static struct spdk_bdev_module virtio_blk_if = { + .name = "virtio_blk", + .module_init = bdev_virtio_initialize, + .get_ctx_size = bdev_virtio_blk_get_ctx_size, +}; + +SPDK_BDEV_MODULE_REGISTER(&virtio_blk_if) + +static int bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf); +static void bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf); + +static struct virtio_blk_io_ctx * +bdev_virtio_blk_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_outhdr *req; + uint8_t *resp; + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + + req = &io_ctx->req; + resp = &io_ctx->resp; + + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + memset(req, 0, sizeof(*req)); + return io_ctx; +} + +static void +bdev_virtio_blk_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_blk_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); + struct virtqueue *vq = virtio_channel->vq; + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } else if (rc != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + bdev_io->type == SPDK_BDEV_IO_TYPE_READ ? + SPDK_VIRTIO_DESC_WR : SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(vq); +} + +static void +bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_io_ctx *io_ctx = bdev_virtio_blk_init_io_vreq(ch, bdev_io); + struct virtio_blk_outhdr *req = &io_ctx->req; + + req->type = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ? 
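/*
 * The request type selected here is VIRTIO_BLK_T_OUT for writes and
 * VIRTIO_BLK_T_IN for reads; the sector assigned just below is always
 * expressed in 512-byte units regardless of the bdev block size. A small
 * worked example, assuming a bdev with a 4096-byte block size:
 *
 *   offset_blocks = 16  ->  req->sector = 16 * 4096 / 512 = 128
 *
 * The request is then posted by bdev_virtio_blk_send_io() above as a
 * three-part descriptor chain: the read-only virtio_blk_outhdr, the data
 * iovecs, and one writable status byte.
 */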
+ VIRTIO_BLK_T_OUT : VIRTIO_BLK_T_IN; + + req->sector = bdev_io->u.bdev.offset_blocks * + spdk_bdev_get_block_size(bdev_io->bdev) / 512; + + bdev_virtio_blk_send_io(ch, bdev_io); +} + +static int +_bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_dev *bvdev = bdev_io->bdev->ctxt; + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_rw, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + case SPDK_BDEV_IO_TYPE_WRITE: + if (bvdev->readonly) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } else { + bdev_virtio_rw(ch, bdev_io); + } + return 0; + case SPDK_BDEV_IO_TYPE_RESET: + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + return 0; + case SPDK_BDEV_IO_TYPE_UNMAP: + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + return -1; + } + + SPDK_UNREACHABLE(); +} + +static void +bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct virtio_blk_dev *bvdev = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + case SPDK_BDEV_IO_TYPE_WRITE: + return !bvdev->readonly; + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_UNMAP: + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_virtio_get_io_channel(void *ctx) +{ + struct virtio_blk_dev *bvdev = ctx; + + return spdk_get_io_channel(bvdev); +} + +static void +virtio_blk_dev_unregister_cb(void *io_device) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + + virtio_dev_stop(vdev); + virtio_dev_destruct(vdev); + spdk_bdev_destruct_done(&bvdev->bdev, 0); + free(bvdev); +} + +static int +bdev_virtio_disk_destruct(void *ctx) +{ + struct virtio_blk_dev *bvdev = ctx; + + spdk_io_device_unregister(bvdev, virtio_blk_dev_unregister_cb); + return 1; +} + +int +bdev_virtio_blk_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(name); + if (bdev == NULL) { + return -ENODEV; + } + + if (bdev->module != &virtio_blk_if) { + return -ENODEV; + } + + spdk_bdev_unregister(bdev, cb_fn, cb_arg); + + return 0; +} + +static int +bdev_virtio_dump_json_config(void *ctx, struct spdk_json_write_ctx *w) +{ + struct virtio_blk_dev *bvdev = ctx; + + virtio_dev_dump_json_info(&bvdev->vdev, w); + return 0; +} + +static void +bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + struct virtio_blk_dev *bvdev = bdev->ctxt; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_virtio_dev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", bvdev->vdev.name); + spdk_json_write_named_string(w, "dev_type", "blk"); + + /* Write transport specific parameters. 
*/ + bvdev->vdev.backend_ops->write_json_config(&bvdev->vdev, w); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static const struct spdk_bdev_fn_table virtio_fn_table = { + .destruct = bdev_virtio_disk_destruct, + .submit_request = bdev_virtio_submit_request, + .io_type_supported = bdev_virtio_io_type_supported, + .get_io_channel = bdev_virtio_get_io_channel, + .dump_info_json = bdev_virtio_dump_json_config, + .write_config_json = bdev_virtio_write_config_json, +}; + +static void +bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +{ + struct virtio_blk_io_ctx *io_ctx = (struct virtio_blk_io_ctx *)bdev_io->driver_ctx; + + spdk_bdev_io_complete(bdev_io, io_ctx->resp == VIRTIO_BLK_S_OK ? + SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); +} + +static int +bdev_virtio_poll(void *arg) +{ + struct bdev_virtio_blk_io_channel *ch = arg; + void *io[32]; + uint32_t io_len[32]; + uint16_t i, cnt; + + cnt = virtio_recv_pkts(ch->vq, io, io_len, SPDK_COUNTOF(io)); + for (i = 0; i < cnt; ++i) { + bdev_virtio_io_cpl(io[i]); + } + + return cnt; +} + +static int +bdev_virtio_blk_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + struct bdev_virtio_blk_io_channel *ch = ctx_buf; + struct virtqueue *vq; + int32_t queue_idx; + + queue_idx = virtio_dev_find_and_acquire_queue(vdev, 0); + if (queue_idx < 0) { + SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); + return -1; + } + + vq = vdev->vqs[queue_idx]; + + ch->vdev = vdev; + ch->vq = vq; + + ch->poller = spdk_poller_register(bdev_virtio_poll, ch, 0); + return 0; +} + +static void +bdev_virtio_blk_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct virtio_blk_dev *bvdev = io_device; + struct virtio_dev *vdev = &bvdev->vdev; + struct bdev_virtio_blk_io_channel *ch = ctx_buf; + struct virtqueue *vq = ch->vq; + + spdk_poller_unregister(&ch->poller); + virtio_dev_release_queue(vdev, vq->vq_queue_index); +} + +static int +virtio_blk_dev_init(struct virtio_blk_dev *bvdev, uint16_t max_queues) +{ + struct virtio_dev *vdev = &bvdev->vdev; + struct spdk_bdev *bdev = &bvdev->bdev; + uint64_t capacity, num_blocks; + uint32_t block_size; + uint16_t host_max_queues; + int rc; + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_BLK_SIZE)) { + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, blk_size), + &block_size, sizeof(block_size)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + + if (block_size == 0 || block_size % 512 != 0) { + SPDK_ERRLOG("%s: invalid block size (%"PRIu32"). Must be " + "a multiple of 512.\n", vdev->name, block_size); + return -EIO; + } + } else { + block_size = 512; + } + + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, capacity), + &capacity, sizeof(capacity)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + + /* `capacity` is a number of 512-byte sectors. */ + num_blocks = capacity * 512 / block_size; + if (num_blocks == 0) { + SPDK_ERRLOG("%s: size too small (size: %"PRIu64", blocksize: %"PRIu32").\n", + vdev->name, capacity * 512, block_size); + return -EIO; + } + + if ((capacity * 512) % block_size != 0) { + SPDK_WARNLOG("%s: size has been rounded down to the nearest block size boundary. 
" + "(block size: %"PRIu32", previous size: %"PRIu64", new size: %"PRIu64")\n", + vdev->name, block_size, capacity * 512, num_blocks * block_size); + } + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), + &host_max_queues, sizeof(host_max_queues)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + return rc; + } + } else { + host_max_queues = 1; + } + + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_RO)) { + bvdev->readonly = true; + } + + if (max_queues == 0) { + SPDK_ERRLOG("%s: requested 0 request queues (%"PRIu16" available).\n", + vdev->name, host_max_queues); + return -EINVAL; + } + + if (max_queues > host_max_queues) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues " + "but only %"PRIu16" available.\n", + vdev->name, max_queues, host_max_queues); + max_queues = host_max_queues; + } + + /* bdev is tied with the virtio device; we can reuse the name */ + bdev->name = vdev->name; + rc = virtio_dev_start(vdev, max_queues, 0); + if (rc != 0) { + return rc; + } + + bdev->product_name = "VirtioBlk Disk"; + bdev->write_cache = 0; + bdev->blocklen = block_size; + bdev->blockcnt = num_blocks; + + bdev->ctxt = bvdev; + bdev->fn_table = &virtio_fn_table; + bdev->module = &virtio_blk_if; + + spdk_io_device_register(bvdev, bdev_virtio_blk_ch_create_cb, + bdev_virtio_blk_ch_destroy_cb, + sizeof(struct bdev_virtio_blk_io_channel), + vdev->name); + + rc = spdk_bdev_register(bdev); + if (rc) { + SPDK_ERRLOG("Failed to register bdev name=%s\n", bdev->name); + spdk_io_device_unregister(bvdev, NULL); + virtio_dev_stop(vdev); + return rc; + } + + return 0; +} + +static struct virtio_blk_dev * +virtio_pci_blk_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) +{ + static int pci_dev_counter = 0; + struct virtio_blk_dev *bvdev; + struct virtio_dev *vdev; + char *default_name = NULL; + uint16_t num_queues; + int rc; + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + SPDK_ERRLOG("virtio device calloc failed\n"); + return NULL; + } + vdev = &bvdev->vdev; + + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioBlk%"PRIu32, pci_dev_counter++); + if (default_name == NULL) { + free(vdev); + return NULL; + } + name = default_name; + } + + rc = virtio_pci_dev_init(vdev, name, pci_ctx); + free(default_name); + + if (rc != 0) { + free(bvdev); + return NULL; + } + + rc = virtio_dev_reset(vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); + if (rc != 0) { + virtio_dev_destruct(vdev); + free(bvdev); + return NULL; + } + + /* TODO: add a way to limit usable virtqueues */ + if (virtio_dev_has_feature(vdev, VIRTIO_BLK_F_MQ)) { + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_blk_config, num_queues), + &num_queues, sizeof(num_queues)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + virtio_dev_destruct(vdev); + free(bvdev); + return NULL; + } + } else { + num_queues = 1; + } + + rc = virtio_blk_dev_init(bvdev, num_queues); + if (rc != 0) { + virtio_dev_destruct(vdev); + free(bvdev); + return NULL; + } + + return bvdev; +} + +static struct virtio_blk_dev * +virtio_user_blk_dev_create(const char *name, const char *path, + uint16_t num_queues, uint32_t queue_size) +{ + struct virtio_blk_dev *bvdev; + int rc; + + bvdev = calloc(1, sizeof(*bvdev)); + if (bvdev == NULL) { + SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); + return NULL; + } + + rc = virtio_user_dev_init(&bvdev->vdev, 
name, path, queue_size); + if (rc != 0) { + SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); + free(bvdev); + return NULL; + } + + rc = virtio_dev_reset(&bvdev->vdev, VIRTIO_BLK_DEV_SUPPORTED_FEATURES); + if (rc != 0) { + virtio_dev_destruct(&bvdev->vdev); + free(bvdev); + return NULL; + } + + rc = virtio_blk_dev_init(bvdev, num_queues); + if (rc != 0) { + virtio_dev_destruct(&bvdev->vdev); + free(bvdev); + return NULL; + } + + return bvdev; +} + +struct bdev_virtio_pci_dev_create_ctx { + const char *name; + struct virtio_blk_dev *ret; +}; + +static int +bdev_virtio_pci_blk_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; + + create_ctx->ret = virtio_pci_blk_dev_create(create_ctx->name, pci_ctx); + if (create_ctx->ret == NULL) { + return -1; + } + + return 0; +} + +struct spdk_bdev * +bdev_virtio_pci_blk_dev_create(const char *name, struct spdk_pci_addr *pci_addr) +{ + struct bdev_virtio_pci_dev_create_ctx create_ctx; + + create_ctx.name = name; + create_ctx.ret = NULL; + + virtio_pci_dev_attach(bdev_virtio_pci_blk_dev_create_cb, &create_ctx, + PCI_DEVICE_ID_VIRTIO_BLK_MODERN, pci_addr); + + if (create_ctx.ret == NULL) { + return NULL; + } + + return &create_ctx.ret->bdev; +} + +static int +virtio_pci_blk_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct virtio_blk_dev *bvdev; + + bvdev = virtio_pci_blk_dev_create(NULL, pci_ctx); + return bvdev == NULL ? -1 : 0; +} + +static int +bdev_virtio_initialize(void) +{ + struct spdk_conf_section *sp; + struct virtio_blk_dev *bvdev; + char *default_name = NULL; + char *path, *type, *name; + unsigned vdev_num; + int num_queues; + bool enable_pci; + int rc = 0; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + path = spdk_conf_section_get_val(sp, "Path"); + if (path == NULL) { + SPDK_ERRLOG("VirtioUserBlk%u: missing Path\n", vdev_num); + return -1; + } + + type = spdk_conf_section_get_val(sp, "Type"); + if (type == NULL || strcmp(type, "Blk") != 0) { + continue; + } + + num_queues = spdk_conf_section_get_intval(sp, "Queues"); + if (num_queues < 1) { + num_queues = 1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioBlk%u", vdev_num); + name = default_name; + } + + bvdev = virtio_user_blk_dev_create(name, path, num_queues, 512); + free(default_name); + default_name = NULL; + + if (bvdev == NULL) { + return -1; + } + } + + sp = spdk_conf_find_section(NULL, "VirtioPci"); + if (sp == NULL) { + return 0; + } + + enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false); + if (enable_pci) { + rc = virtio_pci_dev_enumerate(virtio_pci_blk_dev_enumerate_cb, NULL, + PCI_DEVICE_ID_VIRTIO_BLK_MODERN); + } + + return rc; +} + +struct spdk_bdev * +bdev_virtio_user_blk_dev_create(const char *name, const char *path, + unsigned num_queues, unsigned queue_size) +{ + struct virtio_blk_dev *bvdev; + + bvdev = virtio_user_blk_dev_create(name, path, num_queues, queue_size); + if (bvdev == NULL) { + return NULL; + } + + return &bvdev->bdev; +} + +static int +bdev_virtio_blk_get_ctx_size(void) +{ + return sizeof(struct virtio_blk_io_ctx); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_blk", 
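/*
 * For reference, a legacy configuration-file fragment of the kind consumed by
 * bdev_virtio_initialize() above might look as follows; section and key names
 * come from the parsing code, while the socket path and bdev name are
 * illustrative only:
 *
 *   [VirtioUser0]
 *     Path /tmp/vhost.0      # vhost-user socket, required
 *     Type Blk               # sections with any other Type are skipped here
 *     Queues 2               # defaults to 1 when absent or < 1
 *     Name VirtioBlk0        # defaults to VirtioBlk<N>
 *
 *   [VirtioPci]
 *     Enable Yes             # triggers virtio_pci_dev_enumerate()
 *
 * Each matching VirtioUser section results in one
 * virtio_user_blk_dev_create(name, path, num_queues, 512) call.
 */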
SPDK_LOG_VIRTIO_BLK) diff --git a/src/spdk/lib/bdev/virtio/bdev_virtio_rpc.c b/src/spdk/lib/bdev/virtio/bdev_virtio_rpc.c new file mode 100644 index 00000000..e96fb42a --- /dev/null +++ b/src/spdk/lib/bdev/virtio/bdev_virtio_rpc.c @@ -0,0 +1,613 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#include "bdev_virtio.h" + +#define SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT 1 +#define SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE 512 + +struct rpc_construct_virtio_scsi_dev { + char *path; + char *pci_address; + char *name; + uint32_t vq_count; + uint32_t vq_size; + struct spdk_jsonrpc_request *request; + +}; + +static const struct spdk_json_object_decoder rpc_construct_virtio_user_scsi_dev[] = { + {"path", offsetof(struct rpc_construct_virtio_scsi_dev, path), spdk_json_decode_string }, + {"name", offsetof(struct rpc_construct_virtio_scsi_dev, name), spdk_json_decode_string }, + {"vq_count", offsetof(struct rpc_construct_virtio_scsi_dev, vq_size), spdk_json_decode_uint32, true }, + {"vq_size", offsetof(struct rpc_construct_virtio_scsi_dev, vq_size), spdk_json_decode_uint32, true }, +}; + +static void +free_rpc_construct_virtio_scsi_dev(struct rpc_construct_virtio_scsi_dev *req) +{ + if (!req) { + return; + } + + free(req->path); + free(req->pci_address); + free(req->name); + free(req); +} + +static void +rpc_construct_virtio_scsi_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt) +{ + struct rpc_construct_virtio_scsi_dev *req = ctx; + struct spdk_json_write_ctx *w; + size_t i; + + if (result) { + spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-result)); + free_rpc_construct_virtio_scsi_dev(req); + return; + } + + w = spdk_jsonrpc_begin_result(req->request); + if (w) { + spdk_json_write_array_begin(w); + + for (i = 0; i < cnt; i++) { + spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i])); + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(req->request, w); + } + + free_rpc_construct_virtio_scsi_dev(ctx); +} + +static void +spdk_rpc_create_virtio_user_scsi_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_virtio_scsi_dev *req; + int rc; + + SPDK_WARNLOG("construct_virtio_user_scsi_bdev command has been deprecated and will be removed " + "in the subsequent release. 
Please use construct_virtio_dev instead.\n"); + + req = calloc(1, sizeof(*req)); + if (!req) { + rc = -ENOMEM; + goto invalid; + } + + req->pci_address = NULL; + req->vq_count = SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT; + req->vq_size = SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE; + + if (spdk_json_decode_object(params, rpc_construct_virtio_user_scsi_dev, + SPDK_COUNTOF(rpc_construct_virtio_user_scsi_dev), + req)) { + rc = -EINVAL; + goto invalid; + } + + req->request = request; + rc = bdev_virtio_user_scsi_dev_create(req->name, req->path, req->vq_count, req->vq_size, + rpc_construct_virtio_scsi_dev_cb, req); + if (rc < 0) { + goto invalid; + } + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_construct_virtio_scsi_dev(req); +} +SPDK_RPC_REGISTER("construct_virtio_user_scsi_bdev", spdk_rpc_create_virtio_user_scsi_bdev, + SPDK_RPC_RUNTIME); + +static const struct spdk_json_object_decoder rpc_construct_virtio_pci_scsi_dev[] = { + {"pci_address", offsetof(struct rpc_construct_virtio_scsi_dev, pci_address), spdk_json_decode_string }, + {"name", offsetof(struct rpc_construct_virtio_scsi_dev, name), spdk_json_decode_string }, +}; + +static void +spdk_rpc_construct_virtio_pci_scsi_dev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_virtio_scsi_dev *req; + struct spdk_pci_addr pci_addr; + int rc; + + SPDK_WARNLOG("construct_virtio_pci_scsi_bdev command has been deprecated and will be removed " + "in the subsequent release. Please use construct_virtio_dev instead.\n"); + + req = calloc(1, sizeof(*req)); + if (!req) { + rc = -ENOMEM; + goto invalid; + } + + req->path = NULL; + + if (spdk_json_decode_object(params, rpc_construct_virtio_pci_scsi_dev, + SPDK_COUNTOF(rpc_construct_virtio_pci_scsi_dev), + req)) { + rc = -EINVAL; + goto invalid; + } + + if (spdk_pci_addr_parse(&pci_addr, req->pci_address) != 0) { + SPDK_ERRLOG("Invalid PCI address '%s'\n", req->pci_address); + rc = -EINVAL; + goto invalid; + } + + req->request = request; + rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, + rpc_construct_virtio_scsi_dev_cb, req); + if (rc < 0) { + goto invalid; + } + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_construct_virtio_scsi_dev(req); +} +SPDK_RPC_REGISTER("construct_virtio_pci_scsi_bdev", spdk_rpc_construct_virtio_pci_scsi_dev, + SPDK_RPC_RUNTIME); + +struct rpc_remove_virtio_dev { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_remove_virtio_dev[] = { + {"name", offsetof(struct rpc_remove_virtio_dev, name), spdk_json_decode_string }, +}; + +static void +spdk_rpc_remove_virtio_scsi_bdev_cb(void *ctx, int errnum) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w; + + if (errnum != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-errnum)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_remove_virtio_scsi_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_virtio_dev req = {NULL}; + int rc; + + SPDK_WARNLOG("remove_virtio_scsi_bdev command has been deprecated and will be removed " + "in the subsequent release. 
Please use remove_virtio_bdev instead.\n"); + + if (spdk_json_decode_object(params, rpc_remove_virtio_dev, + SPDK_COUNTOF(rpc_remove_virtio_dev), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = bdev_virtio_scsi_dev_remove(req.name, spdk_rpc_remove_virtio_scsi_bdev_cb, request); + if (rc != 0) { + goto invalid; + } + + free(req.name); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free(req.name); +} +SPDK_RPC_REGISTER("remove_virtio_scsi_bdev", spdk_rpc_remove_virtio_scsi_bdev, SPDK_RPC_RUNTIME); + +static void +spdk_rpc_remove_virtio_bdev_cb(void *ctx, int errnum) +{ + struct spdk_jsonrpc_request *request = ctx; + struct spdk_json_write_ctx *w; + + if (errnum != 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-errnum)); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_remove_virtio_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_virtio_dev req = {NULL}; + int rc; + + if (spdk_json_decode_object(params, rpc_remove_virtio_dev, + SPDK_COUNTOF(rpc_remove_virtio_dev), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = bdev_virtio_blk_dev_remove(req.name, spdk_rpc_remove_virtio_bdev_cb, request); + if (rc == -ENODEV) { + rc = bdev_virtio_scsi_dev_remove(req.name, spdk_rpc_remove_virtio_bdev_cb, request); + } + + if (rc != 0) { + goto invalid; + } + + free(req.name); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free(req.name); +} +SPDK_RPC_REGISTER("remove_virtio_bdev", spdk_rpc_remove_virtio_bdev, SPDK_RPC_RUNTIME); + +static void +spdk_rpc_get_virtio_scsi_devs(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_virtio_scsi_devs requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + bdev_virtio_scsi_dev_list(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_virtio_scsi_devs", spdk_rpc_get_virtio_scsi_devs, SPDK_RPC_RUNTIME) + +struct rpc_construct_virtio_blk_dev { + char *path; + char *pci_address; + char *name; + uint32_t vq_count; + uint32_t vq_size; +}; + +static void +free_rpc_construct_virtio_blk_dev(struct rpc_construct_virtio_blk_dev *req) +{ + free(req->path); + free(req->pci_address); + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_construct_virtio_user_blk_dev[] = { + {"path", offsetof(struct rpc_construct_virtio_blk_dev, path), spdk_json_decode_string }, + {"name", offsetof(struct rpc_construct_virtio_blk_dev, name), spdk_json_decode_string }, + {"vq_count", offsetof(struct rpc_construct_virtio_blk_dev, vq_count), spdk_json_decode_uint32, true }, + {"vq_size", offsetof(struct rpc_construct_virtio_blk_dev, vq_size), spdk_json_decode_uint32, true }, +}; + +static void +spdk_rpc_create_virtio_user_blk_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_virtio_blk_dev req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + int rc; + + req.pci_address = NULL; + req.vq_count = SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT; + 
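/*
 * The spdk_json_decode_object() call below, driven by the
 * rpc_construct_virtio_user_blk_dev decoder above, accepts a params object of
 * roughly this shape ("path" and "name" are required, "vq_count" and
 * "vq_size" are optional and fall back to the defaults assigned here; the
 * values shown are examples only):
 *
 *   { "name": "VirtioBlk0", "path": "/tmp/vhost.0", "vq_count": 1, "vq_size": 512 }
 *
 * The non-deprecated construct_virtio_dev equivalent (see the end of this
 * file) would be:
 *
 *   { "name": "VirtioBlk0", "trtype": "user", "traddr": "/tmp/vhost.0",
 *     "dev_type": "blk", "vq_count": 1, "vq_size": 512 }
 */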
req.vq_size = SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE; + + SPDK_WARNLOG("construct_virtio_user_blk_bdev command has been deprecated and will be removed " + "in the subsequent release. Please use construct_virtio_dev instead.\n"); + + if (spdk_json_decode_object(params, rpc_construct_virtio_user_blk_dev, + SPDK_COUNTOF(rpc_construct_virtio_user_blk_dev), + &req)) { + free_rpc_construct_virtio_blk_dev(&req); + rc = -EINVAL; + goto invalid; + } + + bdev = bdev_virtio_user_blk_dev_create(req.name, req.path, req.vq_count, req.vq_size); + free_rpc_construct_virtio_blk_dev(&req); + if (bdev == NULL) { + rc = -EINVAL; + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_virtio_user_blk_bdev", spdk_rpc_create_virtio_user_blk_bdev, + SPDK_RPC_RUNTIME); + +static const struct spdk_json_object_decoder rpc_construct_virtio_pci_blk_dev[] = { + {"pci_address", offsetof(struct rpc_construct_virtio_blk_dev, pci_address), spdk_json_decode_string }, + {"name", offsetof(struct rpc_construct_virtio_blk_dev, name), spdk_json_decode_string }, +}; + +static void +spdk_rpc_create_virtio_pci_blk_bdev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_virtio_blk_dev req = {0}; + struct spdk_json_write_ctx *w; + struct spdk_bdev *bdev; + struct spdk_pci_addr pci_addr; + int rc; + + req.pci_address = NULL; + + SPDK_WARNLOG("construct_virtio_pci_blk_bdev command has been deprecated and will be removed " + "in the subsequent release. Please use construct_virtio_dev instead.\n"); + + if (spdk_json_decode_object(params, rpc_construct_virtio_pci_blk_dev, + SPDK_COUNTOF(rpc_construct_virtio_pci_blk_dev), + &req)) { + free_rpc_construct_virtio_blk_dev(&req); + rc = -EINVAL; + goto invalid; + } + + if (spdk_pci_addr_parse(&pci_addr, req.pci_address) != 0) { + SPDK_ERRLOG("Invalid PCI address '%s'\n", req.pci_address); + free_rpc_construct_virtio_blk_dev(&req); + rc = -EINVAL; + goto invalid; + } + + bdev = bdev_virtio_pci_blk_dev_create(req.name, &pci_addr); + free_rpc_construct_virtio_blk_dev(&req); + if (bdev == NULL) { + rc = -EINVAL; + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_virtio_pci_blk_bdev", spdk_rpc_create_virtio_pci_blk_bdev, + SPDK_RPC_RUNTIME); + +struct rpc_construct_virtio_dev { + char *name; + char *trtype; + char *traddr; + char *dev_type; + uint32_t vq_count; + uint32_t vq_size; + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_construct_virtio_dev[] = { + {"name", offsetof(struct rpc_construct_virtio_dev, name), spdk_json_decode_string }, + {"trtype", offsetof(struct rpc_construct_virtio_dev, trtype), spdk_json_decode_string }, + {"traddr", offsetof(struct rpc_construct_virtio_dev, traddr), spdk_json_decode_string }, + {"dev_type", offsetof(struct rpc_construct_virtio_dev, dev_type), spdk_json_decode_string }, + {"vq_count", offsetof(struct rpc_construct_virtio_dev, vq_count), spdk_json_decode_uint32, true }, + {"vq_size", 
offsetof(struct rpc_construct_virtio_dev, vq_size), spdk_json_decode_uint32, true }, +}; + +static void +free_rpc_construct_virtio_dev(struct rpc_construct_virtio_dev *req) +{ + free(req->name); + free(req->trtype); + free(req->traddr); + free(req->dev_type); + free(req); +} + +static void +spdk_rpc_create_virtio_dev_cb(void *ctx, int result, struct spdk_bdev **bdevs, size_t cnt) +{ + struct rpc_construct_virtio_dev *req = ctx; + struct spdk_json_write_ctx *w; + size_t i; + + if (result) { + spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-result)); + free_rpc_construct_virtio_dev(req); + return; + } + + w = spdk_jsonrpc_begin_result(req->request); + if (w) { + spdk_json_write_array_begin(w); + + for (i = 0; i < cnt; i++) { + spdk_json_write_string(w, spdk_bdev_get_name(bdevs[i])); + } + + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(req->request, w); + } + + free_rpc_construct_virtio_dev(ctx); +} + +static void +spdk_rpc_create_virtio_dev(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_construct_virtio_dev *req; + struct spdk_bdev *bdev; + struct spdk_pci_addr pci_addr; + bool pci; + int rc; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("calloc() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(ENOMEM)); + return; + } + + if (spdk_json_decode_object(params, rpc_construct_virtio_dev, + SPDK_COUNTOF(rpc_construct_virtio_dev), + req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(EINVAL)); + goto invalid; + } + + if (strcmp(req->trtype, "pci") == 0) { + if (req->vq_count != 0 || req->vq_size != 0) { + SPDK_ERRLOG("VQ count or size is not allowed for PCI transport type\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "vq_count or vq_size is not allowed for PCI transport type."); + goto invalid; + } + + if (spdk_pci_addr_parse(&pci_addr, req->traddr) != 0) { + SPDK_ERRLOG("Invalid PCI address '%s'\n", req->traddr); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid PCI address '%s'", req->traddr); + goto invalid; + } + + pci = true; + } else if (strcmp(req->trtype, "user") == 0) { + req->vq_count = req->vq_count == 0 ? SPDK_VIRTIO_USER_DEFAULT_VQ_COUNT : req->vq_count; + req->vq_size = req->vq_size == 0 ? SPDK_VIRTIO_USER_DEFAULT_QUEUE_SIZE : req->vq_size; + pci = false; + } else { + SPDK_ERRLOG("Invalid trtype '%s'\n", req->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid trtype '%s'", req->trtype); + goto invalid; + } + + req->request = request; + if (strcmp(req->dev_type, "blk") == 0) { + if (pci) { + bdev = bdev_virtio_pci_blk_dev_create(req->name, &pci_addr); + } else { + bdev = bdev_virtio_user_blk_dev_create(req->name, req->traddr, req->vq_count, req->vq_size); + } + + /* Virtio blk doesn't use callback so call it manually to send result. */ + rc = bdev ? 0 : -EINVAL; + spdk_rpc_create_virtio_dev_cb(req, rc, &bdev, bdev ? 1 : 0); + } else if (strcmp(req->dev_type, "scsi") == 0) { + if (pci) { + rc = bdev_virtio_pci_scsi_dev_create(req->name, &pci_addr, spdk_rpc_create_virtio_dev_cb, req); + } else { + rc = bdev_virtio_user_scsi_dev_create(req->name, req->traddr, req->vq_count, req->vq_size, + spdk_rpc_create_virtio_dev_cb, req); + } + + if (rc < 0) { + /* In case of error callback is not called so do it manually to send result. 
*/ + spdk_rpc_create_virtio_dev_cb(req, rc, NULL, 0); + } + } else { + SPDK_ERRLOG("Invalid dev_type '%s'\n", req->dev_type); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid dev_type '%s'", req->dev_type); + goto invalid; + } + + return; +invalid: + free_rpc_construct_virtio_dev(req); +} +SPDK_RPC_REGISTER("construct_virtio_dev", spdk_rpc_create_virtio_dev, SPDK_RPC_RUNTIME); diff --git a/src/spdk/lib/bdev/virtio/bdev_virtio_scsi.c b/src/spdk/lib/bdev/virtio/bdev_virtio_scsi.c new file mode 100644 index 00000000..4ff3db4a --- /dev/null +++ b/src/spdk/lib/bdev/virtio/bdev_virtio_scsi.c @@ -0,0 +1,2017 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/json.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" + +#include + +#include "bdev_virtio.h" + +#define BDEV_VIRTIO_MAX_TARGET 64 +#define BDEV_VIRTIO_SCAN_PAYLOAD_SIZE 256 +#define MGMT_POLL_PERIOD_US (1000 * 5) +#define CTRLQ_RING_SIZE 16 +#define SCAN_REQUEST_RETRIES 5 + +/* Number of non-request queues - eventq and controlq */ +#define SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED 2 + +#define VIRTIO_SCSI_EVENTQ_BUFFER_COUNT 16 + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +static int bdev_virtio_initialize(void); +static void bdev_virtio_finish(void); + +struct virtio_scsi_dev { + /* Generic virtio device data. */ + struct virtio_dev vdev; + + /** Detected SCSI LUNs */ + TAILQ_HEAD(, virtio_scsi_disk) luns; + + /** Context for the SCSI target scan. */ + struct virtio_scsi_scan_base *scan_ctx; + + /** Controlq poller. */ + struct spdk_poller *mgmt_poller; + + /** Controlq messages to be sent. 
*/ + struct spdk_ring *ctrlq_ring; + + /** Buffers for the eventq. */ + struct virtio_scsi_eventq_io *eventq_ios; + + /** Device marked for removal. */ + bool removed; + + /** Callback to be called after vdev removal. */ + bdev_virtio_remove_cb remove_cb; + + /** Context for the `remove_cb`. */ + void *remove_ctx; + + TAILQ_ENTRY(virtio_scsi_dev) tailq; +}; + +struct virtio_scsi_io_ctx { + struct iovec iov_req; + struct iovec iov_resp; + union { + struct virtio_scsi_cmd_req req; + struct virtio_scsi_ctrl_tmf_req tmf_req; + }; + union { + struct virtio_scsi_cmd_resp resp; + struct virtio_scsi_ctrl_tmf_resp tmf_resp; + }; +}; + +struct virtio_scsi_eventq_io { + struct iovec iov; + struct virtio_scsi_event ev; +}; + +struct virtio_scsi_scan_info { + uint64_t num_blocks; + uint32_t block_size; + uint8_t target; + bool unmap_supported; + TAILQ_ENTRY(virtio_scsi_scan_info) tailq; +}; + +struct virtio_scsi_scan_base { + struct virtio_scsi_dev *svdev; + + /** I/O channel used for the scan I/O. */ + struct bdev_virtio_io_channel *channel; + + bdev_virtio_create_cb cb_fn; + void *cb_arg; + + /** Scan all targets on the device. */ + bool full_scan; + + /** Start a full rescan after receiving next scan I/O response. */ + bool restart; + + /** Additional targets to be (re)scanned. */ + TAILQ_HEAD(, virtio_scsi_scan_info) scan_queue; + + /** Remaining attempts for sending the current request. */ + unsigned retries; + + /** If set, the last scan I/O needs to be resent */ + bool needs_resend; + + struct virtio_scsi_io_ctx io_ctx; + struct iovec iov; + uint8_t payload[BDEV_VIRTIO_SCAN_PAYLOAD_SIZE]; + + /** Scan results for the current target. */ + struct virtio_scsi_scan_info info; +}; + +struct virtio_scsi_disk { + struct spdk_bdev bdev; + struct virtio_scsi_dev *svdev; + struct virtio_scsi_scan_info info; + + /** Descriptor opened just to be notified of external bdev hotremove. */ + struct spdk_bdev_desc *notify_desc; + + /** Disk marked for removal. */ + bool removed; + TAILQ_ENTRY(virtio_scsi_disk) link; +}; + +struct bdev_virtio_io_channel { + struct virtio_scsi_dev *svdev; + + /** Virtqueue exclusively assigned to this channel. */ + struct virtqueue *vq; + + /** Virtio response poller. */ + struct spdk_poller *poller; +}; + +static TAILQ_HEAD(, virtio_scsi_dev) g_virtio_scsi_devs = + TAILQ_HEAD_INITIALIZER(g_virtio_scsi_devs); + +static pthread_mutex_t g_virtio_scsi_mutex = PTHREAD_MUTEX_INITIALIZER; + +/** Module finish in progress */ +static bool g_bdev_virtio_finish = false; + +/* Features desired/implemented by this driver. 
*/ +#define VIRTIO_SCSI_DEV_SUPPORTED_FEATURES \ + (1ULL << VIRTIO_SCSI_F_INOUT | \ + 1ULL << VIRTIO_SCSI_F_HOTPLUG | \ + 1ULL << VIRTIO_RING_F_EVENT_IDX | \ + 1ULL << VHOST_USER_F_PROTOCOL_FEATURES) + +static void virtio_scsi_dev_unregister_cb(void *io_device); +static void virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, + bdev_virtio_remove_cb cb_fn, void *cb_arg); +static int bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf); +static void bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf); +static void process_scan_resp(struct virtio_scsi_scan_base *base); +static int bdev_virtio_mgmt_poll(void *arg); + +static int +virtio_scsi_dev_send_eventq_io(struct virtqueue *vq, struct virtio_scsi_eventq_io *io) +{ + int rc; + + rc = virtqueue_req_start(vq, io, 1); + if (rc != 0) { + return -1; + } + + virtqueue_req_add_iovs(vq, &io->iov, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_flush(vq); + + return 0; +} + +static int +virtio_scsi_dev_init(struct virtio_scsi_dev *svdev, uint16_t max_queues) +{ + struct virtio_dev *vdev = &svdev->vdev; + struct spdk_ring *ctrlq_ring; + struct virtio_scsi_eventq_io *eventq_io; + struct virtqueue *eventq; + uint16_t i, num_events; + int rc; + + rc = virtio_dev_reset(vdev, VIRTIO_SCSI_DEV_SUPPORTED_FEATURES); + if (rc != 0) { + return rc; + } + + rc = virtio_dev_start(vdev, max_queues, SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED); + if (rc != 0) { + return rc; + } + + ctrlq_ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, CTRLQ_RING_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (ctrlq_ring == NULL) { + SPDK_ERRLOG("Failed to allocate send ring for the controlq.\n"); + return -1; + } + + rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_CONTROLQ); + if (rc != 0) { + SPDK_ERRLOG("Failed to acquire the controlq.\n"); + spdk_ring_free(ctrlq_ring); + return -1; + } + + rc = virtio_dev_acquire_queue(vdev, VIRTIO_SCSI_EVENTQ); + if (rc != 0) { + SPDK_ERRLOG("Failed to acquire the eventq.\n"); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + spdk_ring_free(ctrlq_ring); + return -1; + } + + eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; + num_events = spdk_min(eventq->vq_nentries, VIRTIO_SCSI_EVENTQ_BUFFER_COUNT); + svdev->eventq_ios = spdk_dma_zmalloc(sizeof(*svdev->eventq_ios) * num_events, + 0, NULL); + if (svdev->eventq_ios == NULL) { + SPDK_ERRLOG("cannot allocate memory for %"PRIu16" eventq buffers\n", + num_events); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + spdk_ring_free(ctrlq_ring); + return -1; + } + + for (i = 0; i < num_events; i++) { + eventq_io = &svdev->eventq_ios[i]; + eventq_io->iov.iov_base = &eventq_io->ev; + eventq_io->iov.iov_len = sizeof(eventq_io->ev); + virtio_scsi_dev_send_eventq_io(eventq, eventq_io); + } + + svdev->ctrlq_ring = ctrlq_ring; + + svdev->mgmt_poller = spdk_poller_register(bdev_virtio_mgmt_poll, svdev, + MGMT_POLL_PERIOD_US); + + TAILQ_INIT(&svdev->luns); + svdev->scan_ctx = NULL; + svdev->removed = false; + svdev->remove_cb = NULL; + svdev->remove_ctx = NULL; + + spdk_io_device_register(svdev, bdev_virtio_scsi_ch_create_cb, + bdev_virtio_scsi_ch_destroy_cb, + sizeof(struct bdev_virtio_io_channel), + svdev->vdev.name); + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_INSERT_TAIL(&g_virtio_scsi_devs, svdev, tailq); + pthread_mutex_unlock(&g_virtio_scsi_mutex); + return 0; +} + +static struct virtio_scsi_dev * +virtio_pci_scsi_dev_create(const char *name, struct virtio_pci_ctx *pci_ctx) +{ + static int pci_dev_counter = 0; + struct virtio_scsi_dev *svdev; + 
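/*
 * Queue layout note for the device initialized by virtio_scsi_dev_init()
 * (called further down in this function): vqs[VIRTIO_SCSI_CONTROLQ] (0) is
 * the control queue, vqs[VIRTIO_SCSI_EVENTQ] (1) the event queue, and request
 * queues start at VIRTIO_SCSI_REQUESTQ (2), i.e. after the
 * SPDK_VIRTIO_SCSI_QUEUE_NUM_FIXED non-request queues. The event queue is
 * pre-populated with up to VIRTIO_SCSI_EVENTQ_BUFFER_COUNT (16) buffers
 * (capped by the queue size), and each I/O channel later takes one request
 * queue for its exclusive use.
 */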
struct virtio_dev *vdev; + char *default_name = NULL; + uint32_t num_queues; + int rc; + + svdev = calloc(1, sizeof(*svdev)); + if (svdev == NULL) { + SPDK_ERRLOG("virtio device calloc failed\n"); + return NULL; + } + + vdev = &svdev->vdev; + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioScsi%"PRIu32, pci_dev_counter++); + if (default_name == NULL) { + free(vdev); + return NULL; + } + name = default_name; + } + + rc = virtio_pci_dev_init(vdev, name, pci_ctx); + free(default_name); + + if (rc != 0) { + free(svdev); + return NULL; + } + + rc = virtio_dev_read_dev_config(vdev, offsetof(struct virtio_scsi_config, num_queues), + &num_queues, sizeof(num_queues)); + if (rc) { + SPDK_ERRLOG("%s: config read failed: %s\n", vdev->name, spdk_strerror(-rc)); + virtio_dev_destruct(vdev); + free(svdev); + return NULL; + } + + rc = virtio_scsi_dev_init(svdev, num_queues); + if (rc != 0) { + virtio_dev_destruct(vdev); + free(svdev); + return NULL; + } + + return svdev; +} + +static struct virtio_scsi_dev * +virtio_user_scsi_dev_create(const char *name, const char *path, + uint16_t num_queues, uint32_t queue_size) +{ + struct virtio_scsi_dev *svdev; + struct virtio_dev *vdev; + int rc; + + svdev = calloc(1, sizeof(*svdev)); + if (svdev == NULL) { + SPDK_ERRLOG("calloc failed for virtio device %s: %s\n", name, path); + return NULL; + } + + vdev = &svdev->vdev; + rc = virtio_user_dev_init(vdev, name, path, queue_size); + if (rc != 0) { + SPDK_ERRLOG("Failed to create virito device %s: %s\n", name, path); + free(svdev); + return NULL; + } + + rc = virtio_scsi_dev_init(svdev, num_queues); + if (rc != 0) { + virtio_dev_destruct(vdev); + free(svdev); + return NULL; + } + + return svdev; +} + +static struct virtio_scsi_disk * +virtio_scsi_dev_get_disk_by_id(struct virtio_scsi_dev *svdev, uint8_t target_id) +{ + struct virtio_scsi_disk *disk; + + TAILQ_FOREACH(disk, &svdev->luns, link) { + if (disk->info.target == target_id) { + return disk; + } + } + + return NULL; +} + +static int virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, + bdev_virtio_create_cb cb_fn, void *cb_arg); +static int send_scan_io(struct virtio_scsi_scan_base *base); +static void _virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target); +static int _virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc); +static void _virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum); +static int virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target); + +static int +bdev_virtio_get_ctx_size(void) +{ + return sizeof(struct virtio_scsi_io_ctx); +} + +static int +bdev_virtio_scsi_config_json(struct spdk_json_write_ctx *w) +{ + struct virtio_scsi_dev *svdev; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_virtio_dev"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "name", svdev->vdev.name); + spdk_json_write_named_string(w, "dev_type", "scsi"); + + /* Write transport specific parameters. 
*/ + svdev->vdev.backend_ops->write_json_config(&svdev->vdev, w); + + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + return 0; +} + + +static struct spdk_bdev_module virtio_scsi_if = { + .name = "virtio_scsi", + .module_init = bdev_virtio_initialize, + .module_fini = bdev_virtio_finish, + .get_ctx_size = bdev_virtio_get_ctx_size, + .config_json = bdev_virtio_scsi_config_json, + .async_init = true, + .async_fini = true, +}; + +SPDK_BDEV_MODULE_REGISTER(&virtio_scsi_if) + +static struct virtio_scsi_io_ctx * +bdev_virtio_init_io_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_cmd_req *req; + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_disk *disk = (struct virtio_scsi_disk *)bdev_io->bdev; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + req = &io_ctx->req; + resp = &io_ctx->resp; + + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + memset(req, 0, sizeof(*req)); + req->lun[0] = 1; + req->lun[1] = disk->info.target; + + return io_ctx; +} + +static struct virtio_scsi_io_ctx * +bdev_virtio_init_tmf_vreq(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_ctrl_tmf_req *tmf_req; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + tmf_req = &io_ctx->tmf_req; + tmf_resp = &io_ctx->tmf_resp; + + io_ctx->iov_req.iov_base = tmf_req; + io_ctx->iov_req.iov_len = sizeof(*tmf_req); + io_ctx->iov_resp.iov_base = tmf_resp; + io_ctx->iov_resp.iov_len = sizeof(*tmf_resp); + + memset(tmf_req, 0, sizeof(*tmf_req)); + tmf_req->lun[0] = 1; + tmf_req->lun[1] = disk->info.target; + + return io_ctx; +} + +static void +bdev_virtio_send_io(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_io_channel *virtio_channel = spdk_io_channel_get_ctx(ch); + struct virtqueue *vq = virtio_channel->vq; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(vq, bdev_io, bdev_io->u.bdev.iovcnt + 2); + if (rc == -ENOMEM) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + return; + } else if (rc != 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + return; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + SPDK_VIRTIO_DESC_WR); + } else { + virtqueue_req_add_iovs(vq, bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, + SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + } + + virtqueue_req_flush(vq); +} + +static void +bdev_virtio_rw(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); + struct virtio_scsi_cmd_req *req = &io_ctx->req; + bool is_write = bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE; + + if (disk->info.num_blocks > (1ULL << 32)) { + req->cdb[0] = is_write ? 
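/*
 * The branch chosen at this point emits a 16-byte CDB (READ(16)/WRITE(16))
 * when the target has more than 2^32 blocks, and a 10-byte CDB otherwise.
 * A worked example for the READ(10) case, with offset_blocks = 4096 and
 * num_blocks = 8:
 *
 *   cdb[0]    = SPDK_SBC_READ_10
 *   cdb[2..5] = 00 00 10 00   (big-endian LBA 4096)
 *   cdb[7..8] = 00 08         (big-endian transfer length 8)
 *
 * matching the to_be32()/to_be16() conversions in the else branch below.
 */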
SPDK_SBC_WRITE_16 : SPDK_SBC_READ_16; + to_be64(&req->cdb[2], bdev_io->u.bdev.offset_blocks); + to_be32(&req->cdb[10], bdev_io->u.bdev.num_blocks); + } else { + req->cdb[0] = is_write ? SPDK_SBC_WRITE_10 : SPDK_SBC_READ_10; + to_be32(&req->cdb[2], bdev_io->u.bdev.offset_blocks); + to_be16(&req->cdb[7], bdev_io->u.bdev.num_blocks); + } + + bdev_virtio_send_io(ch, bdev_io); +} + +static void +bdev_virtio_reset(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct bdev_virtio_io_channel *virtio_ch = spdk_io_channel_get_ctx(ch); + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_tmf_vreq(ch, bdev_io); + struct virtio_scsi_ctrl_tmf_req *tmf_req = &io_ctx->tmf_req; + struct virtio_scsi_dev *svdev = virtio_ch->svdev; + size_t enqueued_count; + + tmf_req->type = VIRTIO_SCSI_T_TMF; + tmf_req->subtype = VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET; + + enqueued_count = spdk_ring_enqueue(svdev->ctrlq_ring, (void **)&bdev_io, 1); + if (spdk_likely(enqueued_count == 1)) { + return; + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); + } +} + +static void +bdev_virtio_unmap(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_io_ctx *io_ctx = bdev_virtio_init_io_vreq(ch, bdev_io); + struct virtio_scsi_cmd_req *req = &io_ctx->req; + struct spdk_scsi_unmap_bdesc *desc, *first_desc; + uint8_t *buf; + uint64_t offset_blocks, num_blocks; + uint16_t cmd_len; + + buf = bdev_io->u.bdev.iovs[0].iov_base; + + offset_blocks = bdev_io->u.bdev.offset_blocks; + num_blocks = bdev_io->u.bdev.num_blocks; + + /* (n-1) * 16-byte descriptors */ + first_desc = desc = (struct spdk_scsi_unmap_bdesc *)&buf[8]; + while (num_blocks > UINT32_MAX) { + to_be64(&desc->lba, offset_blocks); + to_be32(&desc->block_count, UINT32_MAX); + memset(&desc->reserved, 0, sizeof(desc->reserved)); + offset_blocks += UINT32_MAX; + num_blocks -= UINT32_MAX; + desc++; + } + + /* The last descriptor with block_count <= UINT32_MAX */ + to_be64(&desc->lba, offset_blocks); + to_be32(&desc->block_count, num_blocks); + memset(&desc->reserved, 0, sizeof(desc->reserved)); + + /* 8-byte header + n * 16-byte block descriptor */ + cmd_len = 8 + (desc - first_desc + 1) * sizeof(struct spdk_scsi_unmap_bdesc); + + req->cdb[0] = SPDK_SBC_UNMAP; + to_be16(&req->cdb[7], cmd_len); + + /* 8-byte header */ + to_be16(&buf[0], cmd_len - 2); /* total length (excluding the length field) */ + to_be16(&buf[2], cmd_len - 8); /* length of block descriptors */ + memset(&buf[4], 0, 4); /* reserved */ + + bdev_virtio_send_io(ch, bdev_io); +} + +static int _bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_disk *disk = SPDK_CONTAINEROF(bdev_io->bdev, struct virtio_scsi_disk, bdev); + + switch (bdev_io->type) { + case SPDK_BDEV_IO_TYPE_READ: + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_rw, + bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); + return 0; + case SPDK_BDEV_IO_TYPE_WRITE: + bdev_virtio_rw(ch, bdev_io); + return 0; + case SPDK_BDEV_IO_TYPE_RESET: + bdev_virtio_reset(ch, bdev_io); + return 0; + case SPDK_BDEV_IO_TYPE_UNMAP: { + uint64_t buf_len = 8 /* header size */ + + (bdev_io->u.bdev.num_blocks + UINT32_MAX - 1) / + UINT32_MAX * sizeof(struct spdk_scsi_unmap_bdesc); + + if (!disk->info.unmap_supported) { + return -1; + } + + if (buf_len > SPDK_BDEV_LARGE_BUF_MAX_SIZE) { + SPDK_ERRLOG("Trying to UNMAP too many blocks: %"PRIu64"\n", + bdev_io->u.bdev.num_blocks); + return -1; + } + spdk_bdev_io_get_buf(bdev_io, bdev_virtio_unmap, buf_len); + return 
0; + } + case SPDK_BDEV_IO_TYPE_FLUSH: + default: + return -1; + } + return 0; +} + +static void bdev_virtio_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) +{ + if (_bdev_virtio_submit_request(ch, bdev_io) < 0) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static bool +bdev_virtio_io_type_supported(void *ctx, enum spdk_bdev_io_type io_type) +{ + struct virtio_scsi_disk *disk = ctx; + + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: + case SPDK_BDEV_IO_TYPE_RESET: + return true; + + case SPDK_BDEV_IO_TYPE_UNMAP: + return disk->info.unmap_supported; + + default: + return false; + } +} + +static struct spdk_io_channel * +bdev_virtio_get_io_channel(void *ctx) +{ + struct virtio_scsi_disk *disk = ctx; + + return spdk_get_io_channel(disk->svdev); +} + +static int +bdev_virtio_disk_destruct(void *ctx) +{ + struct virtio_scsi_disk *disk = ctx; + struct virtio_scsi_dev *svdev = disk->svdev; + + TAILQ_REMOVE(&svdev->luns, disk, link); + free(disk->bdev.name); + free(disk); + + if (svdev->removed && TAILQ_EMPTY(&svdev->luns)) { + spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); + } + + return 0; +} + +static int +bdev_virtio_dump_info_json(void *ctx, struct spdk_json_write_ctx *w) +{ + struct virtio_scsi_disk *disk = ctx; + + virtio_dev_dump_json_info(&disk->svdev->vdev, w); + return 0; +} + +static void +bdev_virtio_write_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) +{ + /* SCSI targets and LUNS are discovered during scan process so nothing + * to save here. + */ +} + +static const struct spdk_bdev_fn_table virtio_fn_table = { + .destruct = bdev_virtio_disk_destruct, + .submit_request = bdev_virtio_submit_request, + .io_type_supported = bdev_virtio_io_type_supported, + .get_io_channel = bdev_virtio_get_io_channel, + .dump_info_json = bdev_virtio_dump_info_json, + .write_config_json = bdev_virtio_write_config_json, +}; + +static void +get_scsi_status(struct virtio_scsi_cmd_resp *resp, int *sk, int *asc, int *ascq) +{ + /* see spdk_scsi_task_build_sense_data() for sense data details */ + *sk = 0; + *asc = 0; + *ascq = 0; + + if (resp->sense_len < 3) { + return; + } + + *sk = resp->sense[2] & 0xf; + + if (resp->sense_len < 13) { + return; + } + + *asc = resp->sense[12]; + + if (resp->sense_len < 14) { + return; + } + + *ascq = resp->sense[13]; +} + +static void +bdev_virtio_io_cpl(struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int sk, asc, ascq; + + get_scsi_status(&io_ctx->resp, &sk, &asc, &ascq); + spdk_bdev_io_complete_scsi_status(bdev_io, io_ctx->resp.status, sk, asc, ascq); +} + +static int +bdev_virtio_poll(void *arg) +{ + struct bdev_virtio_io_channel *ch = arg; + struct virtio_scsi_dev *svdev = ch->svdev; + struct virtio_scsi_scan_base *scan_ctx = svdev->scan_ctx; + void *io[32]; + uint32_t io_len[32]; + uint16_t i, cnt; + int rc; + + cnt = virtio_recv_pkts(ch->vq, (void **)io, io_len, SPDK_COUNTOF(io)); + for (i = 0; i < cnt; ++i) { + if (spdk_unlikely(scan_ctx && io[i] == &scan_ctx->io_ctx)) { + if (svdev->removed) { + _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); + return -1; + } + + if (scan_ctx->restart) { + scan_ctx->restart = false; + scan_ctx->full_scan = true; + _virtio_scsi_dev_scan_tgt(scan_ctx, 0); + continue; + } + + process_scan_resp(scan_ctx); + continue; + } + + bdev_virtio_io_cpl(io[i]); + } + + if (spdk_unlikely(scan_ctx && 
scan_ctx->needs_resend)) { + if (svdev->removed) { + _virtio_scsi_dev_scan_finish(scan_ctx, -EINTR); + return -1; + } else if (cnt == 0) { + return 0; + } + + rc = send_scan_io(scan_ctx); + if (rc != 0) { + assert(scan_ctx->retries > 0); + scan_ctx->retries--; + if (scan_ctx->retries == 0) { + SPDK_ERRLOG("Target scan failed unrecoverably with rc = %d.\n", rc); + _virtio_scsi_dev_scan_finish(scan_ctx, rc); + } + } + } + + return cnt; +} + +static void +bdev_virtio_tmf_cpl_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + + if (io_ctx->tmf_resp.response == VIRTIO_SCSI_S_OK) { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); + } else { + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); + } +} + +static void +bdev_virtio_tmf_cpl(struct spdk_bdev_io *bdev_io) +{ + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), bdev_virtio_tmf_cpl_cb, bdev_io); +} + +static void +bdev_virtio_eventq_io_cpl(struct virtio_scsi_dev *svdev, struct virtio_scsi_eventq_io *io) +{ + struct virtio_scsi_event *ev = &io->ev; + struct virtio_scsi_disk *disk; + + if (ev->lun[0] != 1) { + SPDK_WARNLOG("Received an event with invalid data layout.\n"); + goto out; + } + + if (ev->event & VIRTIO_SCSI_T_EVENTS_MISSED) { + ev->event &= ~VIRTIO_SCSI_T_EVENTS_MISSED; + virtio_scsi_dev_scan(svdev, NULL, NULL); + } + + switch (ev->event) { + case VIRTIO_SCSI_T_NO_EVENT: + break; + case VIRTIO_SCSI_T_TRANSPORT_RESET: + switch (ev->reason) { + case VIRTIO_SCSI_EVT_RESET_RESCAN: + virtio_scsi_dev_scan_tgt(svdev, ev->lun[1]); + break; + case VIRTIO_SCSI_EVT_RESET_REMOVED: + disk = virtio_scsi_dev_get_disk_by_id(svdev, ev->lun[1]); + if (disk != NULL) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + break; + default: + break; + } + break; + default: + break; + } + +out: + virtio_scsi_dev_send_eventq_io(svdev->vdev.vqs[VIRTIO_SCSI_EVENTQ], io); +} + +static void +bdev_virtio_tmf_abort_nomem_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM); +} + +static void +bdev_virtio_tmf_abort_ioerr_cb(void *ctx) +{ + struct spdk_bdev_io *bdev_io = ctx; + + spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); +} + +static void +bdev_virtio_tmf_abort(struct spdk_bdev_io *bdev_io, int status) +{ + spdk_thread_fn fn; + + if (status == -ENOMEM) { + fn = bdev_virtio_tmf_abort_nomem_cb; + } else { + fn = bdev_virtio_tmf_abort_ioerr_cb; + } + + spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), fn, bdev_io); +} + +static int +bdev_virtio_send_tmf_io(struct virtqueue *ctrlq, struct spdk_bdev_io *bdev_io) +{ + struct virtio_scsi_io_ctx *io_ctx = (struct virtio_scsi_io_ctx *)bdev_io->driver_ctx; + int rc; + + rc = virtqueue_req_start(ctrlq, bdev_io, 2); + if (rc != 0) { + return rc; + } + + virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(ctrlq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(ctrlq); + return 0; +} + +static int +bdev_virtio_mgmt_poll(void *arg) +{ + struct virtio_scsi_dev *svdev = arg; + struct virtio_dev *vdev = &svdev->vdev; + struct virtqueue *eventq = vdev->vqs[VIRTIO_SCSI_EVENTQ]; + struct virtqueue *ctrlq = vdev->vqs[VIRTIO_SCSI_CONTROLQ]; + struct spdk_ring *send_ring = svdev->ctrlq_ring; + void *io[16]; + uint32_t io_len[16]; + uint16_t i, cnt; + int rc; + int total = 0; + + cnt = spdk_ring_dequeue(send_ring, io, SPDK_COUNTOF(io)); + total += cnt; + for (i = 
0; i < cnt; ++i) { + rc = bdev_virtio_send_tmf_io(ctrlq, io[i]); + if (rc != 0) { + bdev_virtio_tmf_abort(io[i], rc); + } + } + + cnt = virtio_recv_pkts(ctrlq, io, io_len, SPDK_COUNTOF(io)); + total += cnt; + for (i = 0; i < cnt; ++i) { + bdev_virtio_tmf_cpl(io[i]); + } + + cnt = virtio_recv_pkts(eventq, io, io_len, SPDK_COUNTOF(io)); + total += cnt; + for (i = 0; i < cnt; ++i) { + bdev_virtio_eventq_io_cpl(svdev, io[i]); + } + + return total; +} + +static int +bdev_virtio_scsi_ch_create_cb(void *io_device, void *ctx_buf) +{ + struct virtio_scsi_dev *svdev = io_device; + struct virtio_dev *vdev = &svdev->vdev; + struct bdev_virtio_io_channel *ch = ctx_buf; + struct virtqueue *vq; + int32_t queue_idx; + + queue_idx = virtio_dev_find_and_acquire_queue(vdev, VIRTIO_SCSI_REQUESTQ); + if (queue_idx < 0) { + SPDK_ERRLOG("Couldn't get an unused queue for the io_channel.\n"); + return -1; + } + + vq = vdev->vqs[queue_idx]; + + ch->svdev = svdev; + ch->vq = vq; + + ch->poller = spdk_poller_register(bdev_virtio_poll, ch, 0); + + return 0; +} + +static void +bdev_virtio_scsi_ch_destroy_cb(void *io_device, void *ctx_buf) +{ + struct bdev_virtio_io_channel *ch = ctx_buf; + struct virtio_scsi_dev *svdev = ch->svdev; + struct virtio_dev *vdev = &svdev->vdev; + struct virtqueue *vq = ch->vq; + + spdk_poller_unregister(&ch->poller); + virtio_dev_release_queue(vdev, vq->vq_queue_index); +} + +static void +_virtio_scsi_dev_scan_finish(struct virtio_scsi_scan_base *base, int errnum) +{ + struct virtio_scsi_dev *svdev = base->svdev; + size_t bdevs_cnt; + struct spdk_bdev *bdevs[BDEV_VIRTIO_MAX_TARGET]; + struct virtio_scsi_disk *disk; + struct virtio_scsi_scan_info *tgt, *next_tgt; + + spdk_put_io_channel(spdk_io_channel_from_ctx(base->channel)); + base->svdev->scan_ctx = NULL; + + TAILQ_FOREACH_SAFE(tgt, &base->scan_queue, tailq, next_tgt) { + TAILQ_REMOVE(&base->scan_queue, tgt, tailq); + free(tgt); + } + + if (base->cb_fn == NULL) { + spdk_dma_free(base); + return; + } + + bdevs_cnt = 0; + if (errnum == 0) { + TAILQ_FOREACH(disk, &svdev->luns, link) { + bdevs[bdevs_cnt] = &disk->bdev; + bdevs_cnt++; + } + } + + base->cb_fn(base->cb_arg, errnum, bdevs, bdevs_cnt); + spdk_dma_free(base); +} + +static int +send_scan_io(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_io_ctx *io_ctx = &base->io_ctx; + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtqueue *vq = base->channel->vq; + int payload_iov_cnt = base->iov.iov_len > 0 ? 
1 : 0; + int rc; + + req->lun[0] = 1; + req->lun[1] = base->info.target; + + rc = virtqueue_req_start(vq, io_ctx, 2 + payload_iov_cnt); + if (rc != 0) { + base->needs_resend = true; + return -1; + } + + virtqueue_req_add_iovs(vq, &io_ctx->iov_req, 1, SPDK_VIRTIO_DESC_RO); + virtqueue_req_add_iovs(vq, &io_ctx->iov_resp, 1, SPDK_VIRTIO_DESC_WR); + virtqueue_req_add_iovs(vq, &base->iov, payload_iov_cnt, SPDK_VIRTIO_DESC_WR); + + virtqueue_req_flush(vq); + return 0; +} + +static int +send_inquiry(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *cdb; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; + cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + cdb->opcode = SPDK_SPC_INQUIRY; + to_be16(cdb->alloc_len, BDEV_VIRTIO_SCAN_PAYLOAD_SIZE); + + return send_scan_io(base); +} + +static int +send_inquiry_vpd(struct virtio_scsi_scan_base *base, uint8_t page_code) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = BDEV_VIRTIO_SCAN_PAYLOAD_SIZE; + inquiry_cdb->opcode = SPDK_SPC_INQUIRY; + inquiry_cdb->evpd = 1; + inquiry_cdb->page_code = page_code; + to_be16(inquiry_cdb->alloc_len, base->iov.iov_len); + + return send_scan_io(base); +} + +static int +send_read_cap_10(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = 8; + req->cdb[0] = SPDK_SBC_READ_CAPACITY_10; + + return send_scan_io(base); +} + +static int +send_read_cap_16(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + + base->iov.iov_len = 32; + req->cdb[0] = SPDK_SPC_SERVICE_ACTION_IN_16; + req->cdb[1] = SPDK_SBC_SAI_READ_CAPACITY_16; + to_be32(&req->cdb[10], base->iov.iov_len); + + return send_scan_io(base); +} + +static int +send_test_unit_ready(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + req->cdb[0] = SPDK_SPC_TEST_UNIT_READY; + base->iov.iov_len = 0; + + return send_scan_io(base); +} + +static int +send_start_stop_unit(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + + memset(req, 0, sizeof(*req)); + req->cdb[0] = SPDK_SBC_START_STOP_UNIT; + req->cdb[4] = SPDK_SBC_START_STOP_UNIT_START_BIT; + base->iov.iov_len = 0; + + return send_scan_io(base); +} + +static int +process_scan_start_stop_unit(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); + } + + return -1; +} + +static int +process_scan_test_unit_ready(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + int sk, asc, ascq; + + get_scsi_status(resp, &sk, &asc, &ascq); + + /* check response, get VPD if spun up otherwise send SSU */ + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_SUPPORTED_VPD_PAGES); + } else if (resp->response == VIRTIO_SCSI_S_OK && + resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && + sk == SPDK_SCSI_SENSE_UNIT_ATTENTION && + asc == SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_READY) { + return send_start_stop_unit(base); + } else { + return -1; + } +} + +static 
int +process_scan_inquiry_standard(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + struct spdk_scsi_cdb_inquiry_data *inquiry_data = + (struct spdk_scsi_cdb_inquiry_data *)base->payload; + + if (resp->status != SPDK_SCSI_STATUS_GOOD) { + return -1; + } + + /* check to make sure its a supported device */ + if (inquiry_data->peripheral_device_type != SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK || + inquiry_data->peripheral_qualifier != SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED) { + SPDK_WARNLOG("Unsupported peripheral device type 0x%02x (qualifier 0x%02x)\n", + inquiry_data->peripheral_device_type, + inquiry_data->peripheral_qualifier); + return -1; + } + + return send_test_unit_ready(base); +} + +static int +process_scan_inquiry_vpd_supported_vpd_pages(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + bool block_provisioning_page_supported = false; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + const uint8_t *vpd_data = base->payload; + const uint8_t *supported_vpd_pages = vpd_data + 4; + uint16_t page_length; + uint16_t num_supported_pages; + uint16_t i; + + page_length = from_be16(vpd_data + 2); + num_supported_pages = spdk_min(page_length, base->iov.iov_len - 4); + + for (i = 0; i < num_supported_pages; i++) { + if (supported_vpd_pages[i] == SPDK_SPC_VPD_BLOCK_THIN_PROVISION) { + block_provisioning_page_supported = true; + break; + } + } + } + + if (block_provisioning_page_supported) { + return send_inquiry_vpd(base, SPDK_SPC_VPD_BLOCK_THIN_PROVISION); + } else { + return send_read_cap_10(base); + } +} + +static int +process_scan_inquiry_vpd_block_thin_provision(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + + base->info.unmap_supported = false; + + if (resp->status == SPDK_SCSI_STATUS_GOOD) { + uint8_t *vpd_data = base->payload; + + base->info.unmap_supported = !!(vpd_data[5] & SPDK_SCSI_UNMAP_LBPU); + } + + SPDK_INFOLOG(SPDK_LOG_VIRTIO, "Target %u: unmap supported = %d\n", + base->info.target, (int)base->info.unmap_supported); + + return send_read_cap_10(base); +} + +static int +process_scan_inquiry(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct spdk_scsi_cdb_inquiry *inquiry_cdb = (struct spdk_scsi_cdb_inquiry *)req->cdb; + + if ((inquiry_cdb->evpd & 1) == 0) { + return process_scan_inquiry_standard(base); + } + + switch (inquiry_cdb->page_code) { + case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES: + return process_scan_inquiry_vpd_supported_vpd_pages(base); + case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: + return process_scan_inquiry_vpd_block_thin_provision(base); + default: + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO, "Unexpected VPD page 0x%02x\n", inquiry_cdb->page_code); + return -1; + } +} + +static void +bdev_virtio_disc_notify_remove(void *remove_ctx) +{ + struct virtio_scsi_disk *disk = remove_ctx; + + disk->removed = true; + spdk_bdev_close(disk->notify_desc); +} + +/* To be called only from the thread performing target scan */ +static int +virtio_scsi_dev_add_tgt(struct virtio_scsi_dev *svdev, struct virtio_scsi_scan_info *info) +{ + struct virtio_scsi_disk *disk; + struct spdk_bdev *bdev; + int rc; + + TAILQ_FOREACH(disk, &svdev->luns, link) { + if (disk->info.target == info->target) { + /* Target is already attached and param change is not supported */ + return 0; + } + } + + if (info->block_size == 0 || info->num_blocks == 0) { + SPDK_ERRLOG("%s: invalid target %u: bs=%"PRIu32" blocks=%"PRIu64"\n", 
+ svdev->vdev.name, info->target, info->block_size, info->num_blocks); + return -EINVAL; + } + + disk = calloc(1, sizeof(*disk)); + if (disk == NULL) { + SPDK_ERRLOG("could not allocate disk\n"); + return -ENOMEM; + } + + disk->svdev = svdev; + memcpy(&disk->info, info, sizeof(*info)); + + bdev = &disk->bdev; + bdev->name = spdk_sprintf_alloc("%st%"PRIu8, svdev->vdev.name, info->target); + if (bdev->name == NULL) { + SPDK_ERRLOG("Couldn't alloc memory for the bdev name.\n"); + free(disk); + return -ENOMEM; + } + + bdev->product_name = "Virtio SCSI Disk"; + bdev->write_cache = 0; + bdev->blocklen = disk->info.block_size; + bdev->blockcnt = disk->info.num_blocks; + + bdev->ctxt = disk; + bdev->fn_table = &virtio_fn_table; + bdev->module = &virtio_scsi_if; + + rc = spdk_bdev_register(&disk->bdev); + if (rc) { + SPDK_ERRLOG("Failed to register bdev name=%s\n", disk->bdev.name); + free(bdev->name); + free(disk); + return rc; + } + + rc = spdk_bdev_open(bdev, false, bdev_virtio_disc_notify_remove, disk, &disk->notify_desc); + if (rc) { + assert(false); + } + + TAILQ_INSERT_TAIL(&svdev->luns, disk, link); + return 0; +} + +static int +process_read_cap_10(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + uint64_t max_block; + uint32_t block_size; + uint8_t target_id = req->lun[1]; + int rc; + + if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("READ CAPACITY (10) failed for target %"PRIu8".\n", target_id); + return -1; + } + + block_size = from_be32(base->payload + 4); + max_block = from_be32(base->payload); + + if (max_block == 0xffffffff) { + return send_read_cap_16(base); + } + + base->info.num_blocks = (uint64_t)max_block + 1; + base->info.block_size = block_size; + + rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); + if (rc != 0) { + return rc; + } + + return _virtio_scsi_dev_scan_next(base, 0); +} + +static int +process_read_cap_16(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + uint8_t target_id = req->lun[1]; + int rc; + + if (resp->response != VIRTIO_SCSI_S_OK || resp->status != SPDK_SCSI_STATUS_GOOD) { + SPDK_ERRLOG("READ CAPACITY (16) failed for target %"PRIu8".\n", target_id); + return -1; + } + + base->info.num_blocks = from_be64(base->payload) + 1; + base->info.block_size = from_be32(base->payload + 8); + rc = virtio_scsi_dev_add_tgt(base->svdev, &base->info); + if (rc != 0) { + return rc; + } + + return _virtio_scsi_dev_scan_next(base, 0); +} + +static void +process_scan_resp(struct virtio_scsi_scan_base *base) +{ + struct virtio_scsi_cmd_req *req = &base->io_ctx.req; + struct virtio_scsi_cmd_resp *resp = &base->io_ctx.resp; + int rc, sk, asc, ascq; + uint8_t target_id; + + if (base->io_ctx.iov_req.iov_len < sizeof(struct virtio_scsi_cmd_req) || + base->io_ctx.iov_resp.iov_len < sizeof(struct virtio_scsi_cmd_resp)) { + SPDK_ERRLOG("Received target scan message with invalid length.\n"); + _virtio_scsi_dev_scan_next(base, -EIO); + return; + } + + get_scsi_status(resp, &sk, &asc, &ascq); + target_id = req->lun[1]; + + if (resp->response == VIRTIO_SCSI_S_BAD_TARGET || + resp->response == VIRTIO_SCSI_S_INCORRECT_LUN) { + _virtio_scsi_dev_scan_next(base, -ENODEV); + return; + } + + if (resp->response != VIRTIO_SCSI_S_OK || + (resp->status == SPDK_SCSI_STATUS_CHECK_CONDITION && + sk != SPDK_SCSI_SENSE_ILLEGAL_REQUEST)) { + 
assert(base->retries > 0); + base->retries--; + if (base->retries == 0) { + SPDK_NOTICELOG("Target %"PRIu8" is present, but unavailable.\n", target_id); + SPDK_TRACEDUMP(SPDK_LOG_VIRTIO, "CDB", req->cdb, sizeof(req->cdb)); + SPDK_TRACEDUMP(SPDK_LOG_VIRTIO, "SENSE DATA", resp->sense, sizeof(resp->sense)); + _virtio_scsi_dev_scan_next(base, -EBUSY); + return; + } + + /* resend the same request */ + rc = send_scan_io(base); + if (rc != 0) { + /* Let response poller do the resend */ + } + return; + } + + base->retries = SCAN_REQUEST_RETRIES; + + switch (req->cdb[0]) { + case SPDK_SPC_INQUIRY: + rc = process_scan_inquiry(base); + break; + case SPDK_SPC_TEST_UNIT_READY: + rc = process_scan_test_unit_ready(base); + break; + case SPDK_SBC_START_STOP_UNIT: + rc = process_scan_start_stop_unit(base); + break; + case SPDK_SBC_READ_CAPACITY_10: + rc = process_read_cap_10(base); + break; + case SPDK_SPC_SERVICE_ACTION_IN_16: + rc = process_read_cap_16(base); + break; + default: + SPDK_ERRLOG("Received invalid target scan message: cdb[0] = %"PRIu8".\n", req->cdb[0]); + rc = -1; + break; + } + + if (rc != 0) { + if (base->needs_resend) { + return; /* Let response poller do the resend */ + } + + _virtio_scsi_dev_scan_next(base, rc); + } +} + +static int +_virtio_scsi_dev_scan_next(struct virtio_scsi_scan_base *base, int rc) +{ + struct virtio_scsi_scan_info *next; + struct virtio_scsi_disk *disk; + uint8_t target_id; + + if (base->full_scan) { + if (rc != 0) { + disk = virtio_scsi_dev_get_disk_by_id(base->svdev, + base->info.target); + if (disk != NULL) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + } + + target_id = base->info.target + 1; + if (target_id < BDEV_VIRTIO_MAX_TARGET) { + _virtio_scsi_dev_scan_tgt(base, target_id); + return 0; + } + + base->full_scan = false; + } + + next = TAILQ_FIRST(&base->scan_queue); + if (next == NULL) { + _virtio_scsi_dev_scan_finish(base, 0); + return 0; + } + + TAILQ_REMOVE(&base->scan_queue, next, tailq); + target_id = next->target; + free(next); + + _virtio_scsi_dev_scan_tgt(base, target_id); + return 0; +} + +static int +virtio_pci_scsi_dev_enumerate_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct virtio_scsi_dev *svdev; + + svdev = virtio_pci_scsi_dev_create(NULL, pci_ctx); + return svdev == NULL ? 
-1 : 0; +} + +static int +bdev_virtio_process_config(void) +{ + struct spdk_conf_section *sp; + struct virtio_scsi_dev *svdev; + char *default_name = NULL; + char *path, *type, *name; + unsigned vdev_num; + int num_queues; + bool enable_pci; + int rc = 0; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VirtioUser")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VirtioUser%u", &vdev_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + rc = -1; + goto out; + } + + path = spdk_conf_section_get_val(sp, "Path"); + if (path == NULL) { + SPDK_ERRLOG("VirtioUser%u: missing Path\n", vdev_num); + rc = -1; + goto out; + } + + type = spdk_conf_section_get_val(sp, "Type"); + if (type != NULL && strcmp(type, "SCSI") != 0) { + continue; + } + + num_queues = spdk_conf_section_get_intval(sp, "Queues"); + if (num_queues < 1) { + num_queues = 1; + } else if (num_queues > SPDK_VIRTIO_MAX_VIRTQUEUES) { + num_queues = SPDK_VIRTIO_MAX_VIRTQUEUES; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + default_name = spdk_sprintf_alloc("VirtioScsi%u", vdev_num); + name = default_name; + } + + svdev = virtio_user_scsi_dev_create(name, path, num_queues, 512); + free(default_name); + default_name = NULL; + + if (svdev == NULL) { + rc = -1; + goto out; + } + } + + sp = spdk_conf_find_section(NULL, "VirtioPci"); + if (sp == NULL) { + return 0; + } + + enable_pci = spdk_conf_section_get_boolval(sp, "Enable", false); + if (enable_pci) { + rc = virtio_pci_dev_enumerate(virtio_pci_scsi_dev_enumerate_cb, NULL, + PCI_DEVICE_ID_VIRTIO_SCSI_MODERN); + } + +out: + return rc; +} + +static int +_virtio_scsi_dev_scan_init(struct virtio_scsi_dev *svdev) +{ + struct virtio_scsi_scan_base *base; + struct spdk_io_channel *io_ch; + struct virtio_scsi_io_ctx *io_ctx; + struct virtio_scsi_cmd_req *req; + struct virtio_scsi_cmd_resp *resp; + + io_ch = spdk_get_io_channel(svdev); + if (io_ch == NULL) { + return -EBUSY; + } + + base = spdk_dma_zmalloc(sizeof(*base), 64, NULL); + if (base == NULL) { + SPDK_ERRLOG("couldn't allocate memory for scsi target scan.\n"); + return -ENOMEM; + } + + base->svdev = svdev; + + base->channel = spdk_io_channel_get_ctx(io_ch); + TAILQ_INIT(&base->scan_queue); + svdev->scan_ctx = base; + + base->iov.iov_base = base->payload; + io_ctx = &base->io_ctx; + req = &io_ctx->req; + resp = &io_ctx->resp; + io_ctx->iov_req.iov_base = req; + io_ctx->iov_req.iov_len = sizeof(*req); + io_ctx->iov_resp.iov_base = resp; + io_ctx->iov_resp.iov_len = sizeof(*resp); + + base->retries = SCAN_REQUEST_RETRIES; + return 0; +} + +static void +_virtio_scsi_dev_scan_tgt(struct virtio_scsi_scan_base *base, uint8_t target) +{ + int rc; + + memset(&base->info, 0, sizeof(base->info)); + base->info.target = target; + + rc = send_inquiry(base); + if (rc) { + /* Let response poller do the resend */ + } +} + +static int +virtio_scsi_dev_scan(struct virtio_scsi_dev *svdev, bdev_virtio_create_cb cb_fn, + void *cb_arg) +{ + struct virtio_scsi_scan_base *base; + struct virtio_scsi_scan_info *tgt, *next_tgt; + int rc; + + if (svdev->scan_ctx) { + if (svdev->scan_ctx->full_scan) { + return -EEXIST; + } + + /* We're about to start a full rescan, so there's no need + * to scan particular targets afterwards. 
+ */ + TAILQ_FOREACH_SAFE(tgt, &svdev->scan_ctx->scan_queue, tailq, next_tgt) { + TAILQ_REMOVE(&svdev->scan_ctx->scan_queue, tgt, tailq); + free(tgt); + } + + svdev->scan_ctx->cb_fn = cb_fn; + svdev->scan_ctx->cb_arg = cb_arg; + svdev->scan_ctx->restart = true; + return 0; + } + + rc = _virtio_scsi_dev_scan_init(svdev); + if (rc != 0) { + return rc; + } + + base = svdev->scan_ctx; + base->cb_fn = cb_fn; + base->cb_arg = cb_arg; + base->full_scan = true; + + _virtio_scsi_dev_scan_tgt(base, 0); + return 0; +} + +static int +virtio_scsi_dev_scan_tgt(struct virtio_scsi_dev *svdev, uint8_t target) +{ + struct virtio_scsi_scan_base *base; + struct virtio_scsi_scan_info *info; + int rc; + + base = svdev->scan_ctx; + if (base) { + info = calloc(1, sizeof(*info)); + if (info == NULL) { + SPDK_ERRLOG("calloc failed\n"); + return -ENOMEM; + } + + info->target = target; + TAILQ_INSERT_TAIL(&base->scan_queue, info, tailq); + return 0; + } + + rc = _virtio_scsi_dev_scan_init(svdev); + if (rc != 0) { + return rc; + } + + base = svdev->scan_ctx; + base->full_scan = true; + _virtio_scsi_dev_scan_tgt(base, target); + return 0; +} + +static void +bdev_virtio_initial_scan_complete(void *ctx, int result, + struct spdk_bdev **bdevs, size_t bdevs_cnt) +{ + struct virtio_scsi_dev *svdev; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + if (svdev->scan_ctx) { + /* another device is still being scanned */ + pthread_mutex_unlock(&g_virtio_scsi_mutex); + return; + } + } + + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_init_done(&virtio_scsi_if); +} + +static int +bdev_virtio_initialize(void) +{ + struct virtio_scsi_dev *svdev, *next_svdev; + int rc; + + rc = bdev_virtio_process_config(); + pthread_mutex_lock(&g_virtio_scsi_mutex); + + if (rc != 0) { + goto err_unlock; + } + + if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { + goto out_unlock; + } + + /* Initialize all created devices and scan available targets */ + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + rc = virtio_scsi_dev_scan(svdev, bdev_virtio_initial_scan_complete, NULL); + if (rc != 0) { + goto err_unlock; + } + } + + pthread_mutex_unlock(&g_virtio_scsi_mutex); + return 0; + +err_unlock: + /* Remove any created devices */ + TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next_svdev) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + +out_unlock: + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_init_done(&virtio_scsi_if); + return rc; +} + +static void +_virtio_scsi_dev_unregister_cb(void *io_device) +{ + struct virtio_scsi_dev *svdev = io_device; + struct virtio_dev *vdev = &svdev->vdev; + bool finish_module; + bdev_virtio_remove_cb remove_cb; + void *remove_ctx; + + assert(spdk_ring_count(svdev->ctrlq_ring) == 0); + spdk_ring_free(svdev->ctrlq_ring); + spdk_poller_unregister(&svdev->mgmt_poller); + + virtio_dev_release_queue(vdev, VIRTIO_SCSI_EVENTQ); + virtio_dev_release_queue(vdev, VIRTIO_SCSI_CONTROLQ); + + virtio_dev_stop(vdev); + virtio_dev_destruct(vdev); + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_REMOVE(&g_virtio_scsi_devs, svdev, tailq); + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + remove_cb = svdev->remove_cb; + remove_ctx = svdev->remove_ctx; + spdk_dma_free(svdev->eventq_ios); + free(svdev); + + if (remove_cb) { + remove_cb(remove_ctx, 0); + } + + finish_module = TAILQ_EMPTY(&g_virtio_scsi_devs); + + if (g_bdev_virtio_finish && finish_module) { + spdk_bdev_module_finish_done(); + } +} + +static void +virtio_scsi_dev_unregister_cb(void 
*io_device) +{ + struct virtio_scsi_dev *svdev = io_device; + struct spdk_thread *thread; + + thread = virtio_dev_queue_get_thread(&svdev->vdev, VIRTIO_SCSI_CONTROLQ); + spdk_thread_send_msg(thread, _virtio_scsi_dev_unregister_cb, io_device); +} + +static void +virtio_scsi_dev_remove(struct virtio_scsi_dev *svdev, + bdev_virtio_remove_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_disk *disk, *disk_tmp; + bool do_remove = true; + + if (svdev->removed) { + if (cb_fn) { + cb_fn(cb_arg, -EBUSY); + } + return; + } + + svdev->remove_cb = cb_fn; + svdev->remove_ctx = cb_arg; + svdev->removed = true; + + if (svdev->scan_ctx) { + /* The removal will continue after we receive a pending scan I/O. */ + return; + } + + TAILQ_FOREACH_SAFE(disk, &svdev->luns, link, disk_tmp) { + if (!disk->removed) { + spdk_bdev_unregister(&disk->bdev, NULL, NULL); + } + do_remove = false; + } + + if (do_remove) { + spdk_io_device_unregister(svdev, virtio_scsi_dev_unregister_cb); + } +} + +static void +bdev_virtio_finish(void) +{ + struct virtio_scsi_dev *svdev, *next; + + g_bdev_virtio_finish = true; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + if (TAILQ_EMPTY(&g_virtio_scsi_devs)) { + pthread_mutex_unlock(&g_virtio_scsi_mutex); + spdk_bdev_module_finish_done(); + return; + } + + /* Defer module finish until all controllers are removed. */ + TAILQ_FOREACH_SAFE(svdev, &g_virtio_scsi_devs, tailq, next) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); +} + +int +bdev_virtio_user_scsi_dev_create(const char *base_name, const char *path, + unsigned num_queues, unsigned queue_size, + bdev_virtio_create_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_dev *svdev; + int rc; + + svdev = virtio_user_scsi_dev_create(base_name, path, num_queues, queue_size); + if (svdev == NULL) { + return -1; + } + + rc = virtio_scsi_dev_scan(svdev, cb_fn, cb_arg); + if (rc) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + + return rc; +} + +struct bdev_virtio_pci_dev_create_ctx { + const char *name; + bdev_virtio_create_cb cb_fn; + void *cb_arg; +}; + +static int +bdev_virtio_pci_scsi_dev_create_cb(struct virtio_pci_ctx *pci_ctx, void *ctx) +{ + struct virtio_scsi_dev *svdev; + struct bdev_virtio_pci_dev_create_ctx *create_ctx = ctx; + int rc; + + svdev = virtio_pci_scsi_dev_create(create_ctx->name, pci_ctx); + if (svdev == NULL) { + return -1; + } + + rc = virtio_scsi_dev_scan(svdev, create_ctx->cb_fn, create_ctx->cb_arg); + if (rc) { + virtio_scsi_dev_remove(svdev, NULL, NULL); + } + + return rc; +} + +int +bdev_virtio_pci_scsi_dev_create(const char *name, struct spdk_pci_addr *pci_addr, + bdev_virtio_create_cb cb_fn, void *cb_arg) +{ + struct bdev_virtio_pci_dev_create_ctx create_ctx; + + create_ctx.name = name; + create_ctx.cb_fn = cb_fn; + create_ctx.cb_arg = cb_arg; + + return virtio_pci_dev_attach(bdev_virtio_pci_scsi_dev_create_cb, &create_ctx, + PCI_DEVICE_ID_VIRTIO_SCSI_MODERN, pci_addr); +} + +int +bdev_virtio_scsi_dev_remove(const char *name, bdev_virtio_remove_cb cb_fn, void *cb_arg) +{ + struct virtio_scsi_dev *svdev; + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + if (strcmp(svdev->vdev.name, name) == 0) { + break; + } + } + + if (svdev == NULL) { + pthread_mutex_unlock(&g_virtio_scsi_mutex); + SPDK_ERRLOG("Cannot find Virtio-SCSI device named '%s'\n", name); + return -ENODEV; + } + + virtio_scsi_dev_remove(svdev, cb_fn, cb_arg); + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + return 0; +} + +void 
+bdev_virtio_scsi_dev_list(struct spdk_json_write_ctx *w) +{ + struct virtio_scsi_dev *svdev; + + spdk_json_write_array_begin(w); + + pthread_mutex_lock(&g_virtio_scsi_mutex); + TAILQ_FOREACH(svdev, &g_virtio_scsi_devs, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, svdev->vdev.name); + + virtio_dev_dump_json_info(&svdev->vdev, w); + + spdk_json_write_object_end(w); + } + pthread_mutex_unlock(&g_virtio_scsi_mutex); + + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio", SPDK_LOG_VIRTIO) diff --git a/src/spdk/lib/bdev/vtune.c b/src/spdk/lib/bdev/vtune.c new file mode 100644 index 00000000..2cb48826 --- /dev/null +++ b/src/spdk/lib/bdev/vtune.c @@ -0,0 +1,49 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/config.h" +#if SPDK_CONFIG_VTUNE + +/* Disable warnings triggered by the VTune code */ +#if defined(__GNUC__) && \ + __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic ignored "-Wsign-compare" +#if __GNUC__ >= 7 +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#endif +#endif + +#include "ittnotify_static.c" + +#endif diff --git a/src/spdk/lib/blob/Makefile b/src/spdk/lib/blob/Makefile new file mode 100644 index 00000000..996155bf --- /dev/null +++ b/src/spdk/lib/blob/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = blobstore.c request.c zeroes.c blob_bs_dev.c +LIBNAME = blob + +DIRS-y += bdev + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blob/bdev/Makefile b/src/spdk/lib/blob/bdev/Makefile new file mode 100644 index 00000000..dbc25dfb --- /dev/null +++ b/src/spdk/lib/blob/bdev/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = blob_bdev.c +LIBNAME = blob_bdev + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blob/bdev/blob_bdev.c b/src/spdk/lib/blob/bdev/blob_bdev.c new file mode 100644 index 00000000..42293142 --- /dev/null +++ b/src/spdk/lib/blob/bdev/blob_bdev.c @@ -0,0 +1,357 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blob_bdev.h" +#include "spdk/blob.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/endian.h" +#include "spdk/bdev_module.h" + +struct blob_bdev { + struct spdk_bs_dev bs_dev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + bool claimed; +}; + +struct blob_resubmit { + struct spdk_bdev_io_wait_entry bdev_io_wait; + enum spdk_bdev_io_type io_type; + struct spdk_bs_dev *dev; + struct spdk_io_channel *channel; + void *payload; + int iovcnt; + uint64_t lba; + uint32_t lba_count; + struct spdk_bs_dev_cb_args *cb_args; +}; +static void bdev_blob_resubmit(void *); + +static inline struct spdk_bdev_desc * +__get_desc(struct spdk_bs_dev *dev) +{ + return ((struct blob_bdev *)dev)->desc; +} + +static inline struct spdk_bdev * +__get_bdev(struct spdk_bs_dev *dev) +{ + return ((struct blob_bdev *)dev)->bdev; +} + +static void +bdev_blob_io_complete(struct spdk_bdev_io *bdev_io, bool success, void *arg) +{ + struct spdk_bs_dev_cb_args *cb_args = arg; + int bserrno; + + if (success) { + bserrno = 0; + } else { + bserrno = -EIO; + } + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); + spdk_bdev_free_io(bdev_io); +} + +static void +bdev_blob_queue_io(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + int iovcnt, + uint64_t lba, uint32_t lba_count, enum spdk_bdev_io_type io_type, + struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + struct spdk_bdev *bdev = __get_bdev(dev); + struct blob_resubmit *ctx; + + ctx = calloc(1, sizeof(struct blob_resubmit)); + + if (ctx == NULL) { + SPDK_ERRLOG("Not enough memory to queue io\n"); + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -ENOMEM); + return; + } + + ctx->io_type = io_type; + ctx->dev = dev; + ctx->channel = channel; + ctx->payload = payload; + ctx->iovcnt = iovcnt; + ctx->lba = lba; + ctx->lba_count = lba_count; + ctx->cb_args = cb_args; + ctx->bdev_io_wait.bdev = bdev; + ctx->bdev_io_wait.cb_fn = bdev_blob_resubmit; + ctx->bdev_io_wait.cb_arg = 
ctx; + + rc = spdk_bdev_queue_io_wait(bdev, channel, &ctx->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed, rc=%d\n", rc); + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + free(ctx); + assert(false); + } +} + +static void +bdev_blob_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_read_blocks(__get_desc(dev), channel, payload, lba, + lba_count, bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, payload, 0, lba, + lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + +static void +bdev_blob_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_write_blocks(__get_desc(dev), channel, payload, lba, + lba_count, bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, payload, 0, lba, + lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + +static void +bdev_blob_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_readv_blocks(__get_desc(dev), channel, iov, iovcnt, lba, + lba_count, bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, + lba_count, SPDK_BDEV_IO_TYPE_READ, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + +static void +bdev_blob_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_writev_blocks(__get_desc(dev), channel, iov, iovcnt, lba, + lba_count, bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, iov, iovcnt, lba, + lba_count, SPDK_BDEV_IO_TYPE_WRITE, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + +static void +bdev_blob_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, + uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int rc; + + rc = spdk_bdev_write_zeroes_blocks(__get_desc(dev), channel, lba, + lba_count, bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, NULL, 0, lba, + lba_count, SPDK_BDEV_IO_TYPE_WRITE_ZEROES, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } +} + +static void +bdev_blob_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, uint64_t lba, + uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; + int rc; + + if (spdk_bdev_io_type_supported(blob_bdev->bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + rc = spdk_bdev_unmap_blocks(__get_desc(dev), channel, lba, lba_count, + bdev_blob_io_complete, cb_args); + if (rc == -ENOMEM) { + bdev_blob_queue_io(dev, channel, NULL, 0, lba, + lba_count, SPDK_BDEV_IO_TYPE_UNMAP, cb_args); + } else if (rc != 0) { + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, rc); + } + } else { + /* + * If the device doesn't support unmap, 
immediately complete + * the request. Blobstore does not rely on unmap zeroing + * data. + */ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); + } +} + +static void +bdev_blob_resubmit(void *arg) +{ + struct blob_resubmit *ctx = (struct blob_resubmit *) arg; + + switch (ctx->io_type) { + case SPDK_BDEV_IO_TYPE_READ: + if (ctx->iovcnt > 0) { + bdev_blob_readv(ctx->dev, ctx->channel, (struct iovec *)ctx->payload, ctx->iovcnt, + ctx->lba, ctx->lba_count, ctx->cb_args); + } else { + bdev_blob_read(ctx->dev, ctx->channel, ctx->payload, + ctx->lba, ctx->lba_count, ctx->cb_args); + } + break; + case SPDK_BDEV_IO_TYPE_WRITE: + if (ctx->iovcnt > 0) { + bdev_blob_writev(ctx->dev, ctx->channel, (struct iovec *)ctx->payload, ctx->iovcnt, + ctx->lba, ctx->lba_count, ctx->cb_args); + } else { + bdev_blob_write(ctx->dev, ctx->channel, ctx->payload, + ctx->lba, ctx->lba_count, ctx->cb_args); + } + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + bdev_blob_unmap(ctx->dev, ctx->channel, + ctx->lba, ctx->lba_count, ctx->cb_args); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + bdev_blob_write_zeroes(ctx->dev, ctx->channel, + ctx->lba, ctx->lba_count, ctx->cb_args); + break; + default: + SPDK_ERRLOG("Unsupported io type %d\n", ctx->io_type); + assert(false); + break; + } + free(ctx); +} + +int +spdk_bs_bdev_claim(struct spdk_bs_dev *bs_dev, struct spdk_bdev_module *module) +{ + struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; + int rc; + + rc = spdk_bdev_module_claim_bdev(blob_bdev->bdev, NULL, module); + if (rc != 0) { + SPDK_ERRLOG("could not claim bs dev\n"); + return rc; + } + + blob_bdev->claimed = true; + + return rc; +} + +static struct spdk_io_channel * +bdev_blob_create_channel(struct spdk_bs_dev *dev) +{ + struct blob_bdev *blob_bdev = (struct blob_bdev *)dev; + + return spdk_bdev_get_io_channel(blob_bdev->desc); +} + +static void +bdev_blob_destroy_channel(struct spdk_bs_dev *dev, struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +static void +bdev_blob_destroy(struct spdk_bs_dev *bs_dev) +{ + struct spdk_bdev_desc *desc = __get_desc(bs_dev); + struct blob_bdev *blob_bdev = (struct blob_bdev *)bs_dev; + + if (blob_bdev->claimed) { + spdk_bdev_module_release_bdev(blob_bdev->bdev); + } + + spdk_bdev_close(desc); + free(bs_dev); +} + +struct spdk_bs_dev * +spdk_bdev_create_bs_dev(struct spdk_bdev *bdev, spdk_bdev_remove_cb_t remove_cb, void *remove_ctx) +{ + struct blob_bdev *b; + struct spdk_bdev_desc *desc; + int rc; + + b = calloc(1, sizeof(*b)); + + if (b == NULL) { + SPDK_ERRLOG("could not allocate blob_bdev\n"); + return NULL; + } + + rc = spdk_bdev_open(bdev, true, remove_cb, remove_ctx, &desc); + if (rc != 0) { + free(b); + return NULL; + } + + b->bdev = bdev; + b->desc = desc; + b->bs_dev.blockcnt = spdk_bdev_get_num_blocks(bdev); + b->bs_dev.blocklen = spdk_bdev_get_block_size(bdev); + b->bs_dev.create_channel = bdev_blob_create_channel; + b->bs_dev.destroy_channel = bdev_blob_destroy_channel; + b->bs_dev.destroy = bdev_blob_destroy; + b->bs_dev.read = bdev_blob_read; + b->bs_dev.write = bdev_blob_write; + b->bs_dev.readv = bdev_blob_readv; + b->bs_dev.writev = bdev_blob_writev; + b->bs_dev.write_zeroes = bdev_blob_write_zeroes; + b->bs_dev.unmap = bdev_blob_unmap; + + return &b->bs_dev; +} diff --git a/src/spdk/lib/blob/blob_bs_dev.c b/src/spdk/lib/blob/blob_bs_dev.c new file mode 100644 index 00000000..91084651 --- /dev/null +++ b/src/spdk/lib/blob/blob_bs_dev.c @@ -0,0 +1,150 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" +#include "spdk/log.h" +#include "blobstore.h" + +static void +blob_bs_dev_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +blob_bs_dev_read_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_dev_cb_args *cb_args = (struct spdk_bs_dev_cb_args *)cb_arg; + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, bserrno); +} + +static inline void +blob_bs_dev_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + spdk_blob_io_read(b->blob, channel, payload, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static inline void +blob_bs_dev_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)dev; + + 
spdk_blob_io_readv(b->blob, channel, iov, iovcnt, lba, lba_count, + blob_bs_dev_read_cpl, cb_args); +} + +static void +blob_bs_dev_destroy_cpl(void *cb_arg, int bserrno) +{ + if (bserrno != 0) { + SPDK_ERRLOG("Error on blob_bs_dev destroy: %d", bserrno); + } + + /* Free blob_bs_dev */ + free(cb_arg); +} + +static void +blob_bs_dev_destroy(struct spdk_bs_dev *bs_dev) +{ + struct spdk_blob_bs_dev *b = (struct spdk_blob_bs_dev *)bs_dev; + + spdk_blob_close(b->blob, blob_bs_dev_destroy_cpl, b); +} + + +struct spdk_bs_dev * +spdk_bs_create_blob_bs_dev(struct spdk_blob *blob) +{ + struct spdk_blob_bs_dev *b; + + b = calloc(1, sizeof(*b)); + if (b == NULL) { + return NULL; + } + /* snapshot blob */ + b->bs_dev.blockcnt = blob->active.num_clusters * + blob->bs->pages_per_cluster * _spdk_bs_io_unit_per_page(blob->bs); + b->bs_dev.blocklen = spdk_bs_get_io_unit_size(blob->bs); + b->bs_dev.create_channel = NULL; + b->bs_dev.destroy_channel = NULL; + b->bs_dev.destroy = blob_bs_dev_destroy; + b->bs_dev.write = blob_bs_dev_write; + b->bs_dev.writev = blob_bs_dev_writev; + b->bs_dev.read = blob_bs_dev_read; + b->bs_dev.readv = blob_bs_dev_readv; + b->bs_dev.write_zeroes = blob_bs_dev_write_zeroes; + b->bs_dev.unmap = blob_bs_dev_unmap; + b->blob = blob; + + return &b->bs_dev; +} diff --git a/src/spdk/lib/blob/blobstore.c b/src/spdk/lib/blob/blobstore.c new file mode 100644 index 00000000..3b294180 --- /dev/null +++ b/src/spdk/lib/blob/blobstore.c @@ -0,0 +1,5720 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" +#include "spdk/crc32.h" +#include "spdk/env.h" +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/bit_array.h" +#include "spdk/likely.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +#include "blobstore.h" + +#define BLOB_CRC32C_INITIAL 0xffffffffUL + +static int spdk_bs_register_md_thread(struct spdk_blob_store *bs); +static int spdk_bs_unregister_md_thread(struct spdk_blob_store *bs); +static void _spdk_blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno); +static void _spdk_blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, spdk_blob_op_complete cb_fn, void *cb_arg); + +static int _spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal); +static int _spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal); +static int _spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal); + +static void +_spdk_blob_verify_md_op(struct spdk_blob *blob) +{ + assert(blob != NULL); + assert(spdk_get_thread() == blob->bs->md_thread); + assert(blob->state != SPDK_BLOB_STATE_LOADING); +} + +static inline size_t +divide_round_up(size_t num, size_t divisor) +{ + return (num + divisor - 1) / divisor; +} + +static void +_spdk_bs_claim_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == false); + assert(bs->num_free_clusters > 0); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %u\n", cluster_num); + + spdk_bit_array_set(bs->used_clusters, cluster_num); + bs->num_free_clusters--; +} + +static int +_spdk_blob_insert_cluster(struct spdk_blob *blob, uint32_t cluster_num, uint64_t cluster) +{ + uint64_t *cluster_lba = &blob->active.clusters[cluster_num]; + + _spdk_blob_verify_md_op(blob); + + if (*cluster_lba != 0) { + return -EEXIST; + } + + *cluster_lba = _spdk_bs_cluster_to_lba(blob->bs, cluster); + return 0; +} + +static int +_spdk_bs_allocate_cluster(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t *lowest_free_cluster, bool update_map) +{ + pthread_mutex_lock(&blob->bs->used_clusters_mutex); + *lowest_free_cluster = spdk_bit_array_find_first_clear(blob->bs->used_clusters, + *lowest_free_cluster); + if (*lowest_free_cluster == UINT32_MAX) { + /* No more free clusters. 
Cannot satisfy the request */ + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + return -ENOSPC; + } + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming cluster %lu for blob %lu\n", *lowest_free_cluster, blob->id); + _spdk_bs_claim_cluster(blob->bs, *lowest_free_cluster); + pthread_mutex_unlock(&blob->bs->used_clusters_mutex); + + if (update_map) { + _spdk_blob_insert_cluster(blob, cluster_num, *lowest_free_cluster); + } + + return 0; +} + +static void +_spdk_bs_release_cluster(struct spdk_blob_store *bs, uint32_t cluster_num) +{ + assert(cluster_num < spdk_bit_array_capacity(bs->used_clusters)); + assert(spdk_bit_array_get(bs->used_clusters, cluster_num) == true); + assert(bs->num_free_clusters < bs->total_clusters); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Releasing cluster %u\n", cluster_num); + + pthread_mutex_lock(&bs->used_clusters_mutex); + spdk_bit_array_clear(bs->used_clusters, cluster_num); + bs->num_free_clusters++; + pthread_mutex_unlock(&bs->used_clusters_mutex); +} + +static void +_spdk_blob_xattrs_init(struct spdk_blob_xattr_opts *xattrs) +{ + xattrs->count = 0; + xattrs->names = NULL; + xattrs->ctx = NULL; + xattrs->get_value = NULL; +} + +void +spdk_blob_opts_init(struct spdk_blob_opts *opts) +{ + opts->num_clusters = 0; + opts->thin_provision = false; + _spdk_blob_xattrs_init(&opts->xattrs); +} + +static struct spdk_blob * +_spdk_blob_alloc(struct spdk_blob_store *bs, spdk_blob_id id) +{ + struct spdk_blob *blob; + + blob = calloc(1, sizeof(*blob)); + if (!blob) { + return NULL; + } + + blob->id = id; + blob->bs = bs; + + blob->parent_id = SPDK_BLOBID_INVALID; + + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->active.num_pages = 1; + blob->active.pages = calloc(1, sizeof(*blob->active.pages)); + if (!blob->active.pages) { + free(blob); + return NULL; + } + + blob->active.pages[0] = _spdk_bs_blobid_to_page(id); + + TAILQ_INIT(&blob->xattrs); + TAILQ_INIT(&blob->xattrs_internal); + + return blob; +} + +static void +_spdk_xattrs_free(struct spdk_xattr_tailq *xattrs) +{ + struct spdk_xattr *xattr, *xattr_tmp; + + TAILQ_FOREACH_SAFE(xattr, xattrs, link, xattr_tmp) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->name); + free(xattr->value); + free(xattr); + } +} + +static void +_spdk_blob_free(struct spdk_blob *blob) +{ + assert(blob != NULL); + + free(blob->active.clusters); + free(blob->clean.clusters); + free(blob->active.pages); + free(blob->clean.pages); + + _spdk_xattrs_free(&blob->xattrs); + _spdk_xattrs_free(&blob->xattrs_internal); + + if (blob->back_bs_dev) { + blob->back_bs_dev->destroy(blob->back_bs_dev); + } + + free(blob); +} + +struct freeze_io_ctx { + struct spdk_bs_cpl cpl; + struct spdk_blob *blob; +}; + +static void +_spdk_blob_io_sync(struct spdk_io_channel_iter *i) +{ + spdk_for_each_channel_continue(i, 0); +} + +static void +_spdk_blob_execute_queued_io(struct spdk_io_channel_iter *i) +{ + struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); + struct spdk_bs_channel *ch = spdk_io_channel_get_ctx(_ch); + struct freeze_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + spdk_bs_user_op_t *op, *tmp; + + TAILQ_FOREACH_SAFE(op, &ch->queued_io, link, tmp) { + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + + if (args->blob == ctx->blob) { + TAILQ_REMOVE(&ch->queued_io, op, link); + spdk_bs_user_op_execute(op); + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static void +_spdk_blob_io_cpl(struct spdk_io_channel_iter *i, int status) +{ + struct freeze_io_ctx 
*ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cpl.u.blob_basic.cb_fn(ctx->cpl.u.blob_basic.cb_arg, 0); + + free(ctx); +} + +static void +_spdk_blob_freeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + /* Freeze I/O on blob */ + blob->frozen_refcnt++; + + if (blob->frozen_refcnt == 1) { + spdk_for_each_channel(blob->bs, _spdk_blob_io_sync, ctx, _spdk_blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static void +_spdk_blob_unfreeze_io(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct freeze_io_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + ctx->cpl.u.blob_basic.cb_fn = cb_fn; + ctx->cpl.u.blob_basic.cb_arg = cb_arg; + ctx->blob = blob; + + assert(blob->frozen_refcnt > 0); + + blob->frozen_refcnt--; + + if (blob->frozen_refcnt == 0) { + spdk_for_each_channel(blob->bs, _spdk_blob_execute_queued_io, ctx, _spdk_blob_io_cpl); + } else { + cb_fn(cb_arg, 0); + free(ctx); + } +} + +static int +_spdk_blob_mark_clean(struct spdk_blob *blob) +{ + uint64_t *clusters = NULL; + uint32_t *pages = NULL; + + assert(blob != NULL); + + if (blob->active.num_clusters) { + assert(blob->active.clusters); + clusters = calloc(blob->active.num_clusters, sizeof(*blob->active.clusters)); + if (!clusters) { + return -ENOMEM; + } + memcpy(clusters, blob->active.clusters, blob->active.num_clusters * sizeof(*clusters)); + } + + if (blob->active.num_pages) { + assert(blob->active.pages); + pages = calloc(blob->active.num_pages, sizeof(*blob->active.pages)); + if (!pages) { + free(clusters); + return -ENOMEM; + } + memcpy(pages, blob->active.pages, blob->active.num_pages * sizeof(*pages)); + } + + free(blob->clean.clusters); + free(blob->clean.pages); + + blob->clean.num_clusters = blob->active.num_clusters; + blob->clean.clusters = blob->active.clusters; + blob->clean.num_pages = blob->active.num_pages; + blob->clean.pages = blob->active.pages; + + blob->active.clusters = clusters; + blob->active.pages = pages; + + /* If the metadata was dirtied again while the metadata was being written to disk, + * we do not want to revert the DIRTY state back to CLEAN here. 
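+	 * Only a blob that is still in the LOADING state is moved to CLEAN below; a
+	 * blob that was marked DIRTY again in the meantime keeps its DIRTY state.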
+ */ + if (blob->state == SPDK_BLOB_STATE_LOADING) { + blob->state = SPDK_BLOB_STATE_CLEAN; + } + + return 0; +} + +static int +_spdk_blob_deserialize_xattr(struct spdk_blob *blob, + struct spdk_blob_md_descriptor_xattr *desc_xattr, bool internal) +{ + struct spdk_xattr *xattr; + + if (desc_xattr->length != sizeof(desc_xattr->name_length) + + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + return -EINVAL; + } + + xattr = calloc(1, sizeof(*xattr)); + if (xattr == NULL) { + return -ENOMEM; + } + + xattr->name = malloc(desc_xattr->name_length + 1); + if (xattr->name == NULL) { + free(xattr); + return -ENOMEM; + } + memcpy(xattr->name, desc_xattr->name, desc_xattr->name_length); + xattr->name[desc_xattr->name_length] = '\0'; + + xattr->value = malloc(desc_xattr->value_length); + if (xattr->value == NULL) { + free(xattr->name); + free(xattr); + return -ENOMEM; + } + xattr->value_len = desc_xattr->value_length; + memcpy(xattr->value, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + + TAILQ_INSERT_TAIL(internal ? &blob->xattrs_internal : &blob->xattrs, xattr, link); + + return 0; +} + + +static int +_spdk_blob_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob *blob) +{ + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + void *tmp; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + struct spdk_blob_md_descriptor_flags *desc_flags; + + desc_flags = (struct spdk_blob_md_descriptor_flags *)desc; + + if (desc_flags->length != sizeof(*desc_flags) - sizeof(*desc)) { + return -EINVAL; + } + + if ((desc_flags->invalid_flags | SPDK_BLOB_INVALID_FLAGS_MASK) != + SPDK_BLOB_INVALID_FLAGS_MASK) { + return -EINVAL; + } + + if ((desc_flags->data_ro_flags | SPDK_BLOB_DATA_RO_FLAGS_MASK) != + SPDK_BLOB_DATA_RO_FLAGS_MASK) { + blob->data_ro = true; + blob->md_ro = true; + } + + if ((desc_flags->md_ro_flags | SPDK_BLOB_MD_RO_FLAGS_MASK) != + SPDK_BLOB_MD_RO_FLAGS_MASK) { + blob->md_ro = true; + } + + if ((desc_flags->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + blob->invalid_flags = desc_flags->invalid_flags; + blob->data_ro_flags = desc_flags->data_ro_flags; + blob->md_ro_flags = desc_flags->md_ro_flags; + + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT) { + struct spdk_blob_md_descriptor_extent *desc_extent; + unsigned int i, j; + unsigned int cluster_count = blob->active.num_clusters; + + desc_extent = (struct spdk_blob_md_descriptor_extent *)desc; + + if (desc_extent->length == 0 || + (desc_extent->length % sizeof(desc_extent->extents[0]) != 0)) { + return -EINVAL; + } + + for (i = 0; i < desc_extent->length / sizeof(desc_extent->extents[0]); i++) { + for (j = 0; j < desc_extent->extents[i].length; j++) { + if (desc_extent->extents[i].cluster_idx != 0) { + if (!spdk_bit_array_get(blob->bs->used_clusters, + desc_extent->extents[i].cluster_idx + j)) { + return -EINVAL; + } + } + cluster_count++; + } + } + + if (cluster_count == 0) { + return -EINVAL; + } + tmp = realloc(blob->active.clusters, cluster_count * sizeof(uint64_t)); + if (tmp == NULL) { + return -ENOMEM; + } + blob->active.clusters = tmp; + blob->active.cluster_array_size = cluster_count; + + for (i 
= 0; i < desc_extent->length / sizeof(desc_extent->extents[0]); i++) { + for (j = 0; j < desc_extent->extents[i].length; j++) { + if (desc_extent->extents[i].cluster_idx != 0) { + blob->active.clusters[blob->active.num_clusters++] = _spdk_bs_cluster_to_lba(blob->bs, + desc_extent->extents[i].cluster_idx + j); + } else if (spdk_blob_is_thin_provisioned(blob)) { + blob->active.clusters[blob->active.num_clusters++] = 0; + } else { + return -EINVAL; + } + } + } + + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + int rc; + + rc = _spdk_blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, false); + if (rc != 0) { + return rc; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + int rc; + + rc = _spdk_blob_deserialize_xattr(blob, + (struct spdk_blob_md_descriptor_xattr *) desc, true); + if (rc != 0) { + return rc; + } + } else { + /* Unrecognized descriptor type. Do not fail - just continue to the + * next descriptor. If this descriptor is associated with some feature + * defined in a newer version of blobstore, that version of blobstore + * should create and set an associated feature flag to specify if this + * blob can be loaded or not. + */ + } + + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + + return 0; +} + +static int +_spdk_blob_parse(const struct spdk_blob_md_page *pages, uint32_t page_count, + struct spdk_blob *blob) +{ + const struct spdk_blob_md_page *page; + uint32_t i; + int rc; + + assert(page_count > 0); + assert(pages[0].sequence_num == 0); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_LOADING); + assert(blob->active.clusters == NULL); + + /* The blobid provided doesn't match what's in the MD, this can + * happen for example if a bogus blobid is passed in through open. + */ + if (blob->id != pages[0].id) { + SPDK_ERRLOG("Blobid (%lu) doesn't match what's in metadata (%lu)\n", + blob->id, pages[0].id); + return -ENOENT; + } + + for (i = 0; i < page_count; i++) { + page = &pages[i]; + + assert(page->id == blob->id); + assert(page->sequence_num == i); + + rc = _spdk_blob_parse_page(page, blob); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static int +_spdk_blob_serialize_add_page(const struct spdk_blob *blob, + struct spdk_blob_md_page **pages, + uint32_t *page_count, + struct spdk_blob_md_page **last_page) +{ + struct spdk_blob_md_page *page; + + assert(pages != NULL); + assert(page_count != NULL); + + if (*page_count == 0) { + assert(*pages == NULL); + *page_count = 1; + *pages = spdk_dma_malloc(SPDK_BS_PAGE_SIZE, + SPDK_BS_PAGE_SIZE, + NULL); + } else { + assert(*pages != NULL); + (*page_count)++; + *pages = spdk_dma_realloc(*pages, + SPDK_BS_PAGE_SIZE * (*page_count), + SPDK_BS_PAGE_SIZE, + NULL); + } + + if (*pages == NULL) { + *page_count = 0; + *last_page = NULL; + return -ENOMEM; + } + + page = &(*pages)[*page_count - 1]; + memset(page, 0, sizeof(*page)); + page->id = blob->id; + page->sequence_num = *page_count - 1; + page->next = SPDK_INVALID_MD_PAGE; + *last_page = page; + + return 0; +} + +/* Transform the in-memory representation 'xattr' into an on-disk xattr descriptor. + * Update required_sz on both success and failure. 
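+ * If buf_sz is smaller than the computed required_sz, nothing is written and
+ * -1 is returned so the caller can add a new metadata page and retry.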
+ * + */ +static int +_spdk_blob_serialize_xattr(const struct spdk_xattr *xattr, + uint8_t *buf, size_t buf_sz, + size_t *required_sz, bool internal) +{ + struct spdk_blob_md_descriptor_xattr *desc; + + *required_sz = sizeof(struct spdk_blob_md_descriptor_xattr) + + strlen(xattr->name) + + xattr->value_len; + + if (buf_sz < *required_sz) { + return -1; + } + + desc = (struct spdk_blob_md_descriptor_xattr *)buf; + + desc->type = internal ? SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL : SPDK_MD_DESCRIPTOR_TYPE_XATTR; + desc->length = sizeof(desc->name_length) + + sizeof(desc->value_length) + + strlen(xattr->name) + + xattr->value_len; + desc->name_length = strlen(xattr->name); + desc->value_length = xattr->value_len; + + memcpy(desc->name, xattr->name, desc->name_length); + memcpy((void *)((uintptr_t)desc->name + desc->name_length), + xattr->value, + desc->value_length); + + return 0; +} + +static void +_spdk_blob_serialize_extent(const struct spdk_blob *blob, + uint64_t start_cluster, uint64_t *next_cluster, + uint8_t *buf, size_t buf_sz) +{ + struct spdk_blob_md_descriptor_extent *desc; + size_t cur_sz; + uint64_t i, extent_idx; + uint64_t lba, lba_per_cluster, lba_count; + + /* The buffer must have room for at least one extent */ + cur_sz = sizeof(struct spdk_blob_md_descriptor) + sizeof(desc->extents[0]); + if (buf_sz < cur_sz) { + *next_cluster = start_cluster; + return; + } + + desc = (struct spdk_blob_md_descriptor_extent *)buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_EXTENT; + + lba_per_cluster = _spdk_bs_cluster_to_lba(blob->bs, 1); + + lba = blob->active.clusters[start_cluster]; + lba_count = lba_per_cluster; + extent_idx = 0; + for (i = start_cluster + 1; i < blob->active.num_clusters; i++) { + if ((lba + lba_count) == blob->active.clusters[i]) { + lba_count += lba_per_cluster; + continue; + } else if (lba == 0 && blob->active.clusters[i] == 0) { + lba_count += lba_per_cluster; + continue; + } + desc->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + cur_sz += sizeof(desc->extents[extent_idx]); + + if (buf_sz < cur_sz) { + /* If we ran out of buffer space, return */ + desc->length = sizeof(desc->extents[0]) * extent_idx; + *next_cluster = i; + return; + } + + lba = blob->active.clusters[i]; + lba_count = lba_per_cluster; + } + + desc->extents[extent_idx].cluster_idx = lba / lba_per_cluster; + desc->extents[extent_idx].length = lba_count / lba_per_cluster; + extent_idx++; + + desc->length = sizeof(desc->extents[0]) * extent_idx; + *next_cluster = blob->active.num_clusters; + + return; +} + +static void +_spdk_blob_serialize_flags(const struct spdk_blob *blob, + uint8_t *buf, size_t *buf_sz) +{ + struct spdk_blob_md_descriptor_flags *desc; + + /* + * Flags get serialized first, so we should always have room for the flags + * descriptor. 
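+	 * The caller hands in a freshly added page here, so *buf_sz still covers the
+	 * page's entire descriptor area.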
+ */ + assert(*buf_sz >= sizeof(*desc)); + + desc = (struct spdk_blob_md_descriptor_flags *)buf; + desc->type = SPDK_MD_DESCRIPTOR_TYPE_FLAGS; + desc->length = sizeof(*desc) - sizeof(struct spdk_blob_md_descriptor); + desc->invalid_flags = blob->invalid_flags; + desc->data_ro_flags = blob->data_ro_flags; + desc->md_ro_flags = blob->md_ro_flags; + + *buf_sz -= sizeof(*desc); +} + +static int +_spdk_blob_serialize_xattrs(const struct spdk_blob *blob, + const struct spdk_xattr_tailq *xattrs, bool internal, + struct spdk_blob_md_page **pages, + struct spdk_blob_md_page *cur_page, + uint32_t *page_count, uint8_t **buf, + size_t *remaining_sz) +{ + const struct spdk_xattr *xattr; + int rc; + + TAILQ_FOREACH(xattr, xattrs, link) { + size_t required_sz = 0; + + rc = _spdk_blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + if (rc < 0) { + /* Need to add a new page to the chain */ + rc = _spdk_blob_serialize_add_page(blob, pages, page_count, + &cur_page); + if (rc < 0) { + spdk_dma_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + + *buf = (uint8_t *)cur_page->descriptors; + *remaining_sz = sizeof(cur_page->descriptors); + + /* Try again */ + required_sz = 0; + rc = _spdk_blob_serialize_xattr(xattr, + *buf, *remaining_sz, + &required_sz, internal); + + if (rc < 0) { + spdk_dma_free(*pages); + *pages = NULL; + *page_count = 0; + return rc; + } + } + + *remaining_sz -= required_sz; + *buf += required_sz; + } + + return 0; +} + +static int +_spdk_blob_serialize(const struct spdk_blob *blob, struct spdk_blob_md_page **pages, + uint32_t *page_count) +{ + struct spdk_blob_md_page *cur_page; + int rc; + uint8_t *buf; + size_t remaining_sz; + uint64_t last_cluster; + + assert(pages != NULL); + assert(page_count != NULL); + assert(blob != NULL); + assert(blob->state == SPDK_BLOB_STATE_DIRTY); + + *pages = NULL; + *page_count = 0; + + /* A blob always has at least 1 page, even if it has no descriptors */ + rc = _spdk_blob_serialize_add_page(blob, pages, page_count, &cur_page); + if (rc < 0) { + return rc; + } + + buf = (uint8_t *)cur_page->descriptors; + remaining_sz = sizeof(cur_page->descriptors); + + /* Serialize flags */ + _spdk_blob_serialize_flags(blob, buf, &remaining_sz); + buf += sizeof(struct spdk_blob_md_descriptor_flags); + + /* Serialize xattrs */ + rc = _spdk_blob_serialize_xattrs(blob, &blob->xattrs, false, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + /* Serialize internal xattrs */ + rc = _spdk_blob_serialize_xattrs(blob, &blob->xattrs_internal, true, + pages, cur_page, page_count, &buf, &remaining_sz); + if (rc < 0) { + return rc; + } + + /* Serialize extents */ + last_cluster = 0; + while (last_cluster < blob->active.num_clusters) { + _spdk_blob_serialize_extent(blob, last_cluster, &last_cluster, + buf, remaining_sz); + + if (last_cluster == blob->active.num_clusters) { + break; + } + + rc = _spdk_blob_serialize_add_page(blob, pages, page_count, + &cur_page); + if (rc < 0) { + return rc; + } + + buf = (uint8_t *)cur_page->descriptors; + remaining_sz = sizeof(cur_page->descriptors); + } + + return 0; +} + +struct spdk_blob_load_ctx { + struct spdk_blob *blob; + + struct spdk_blob_md_page *pages; + uint32_t num_pages; + spdk_bs_sequence_t *seq; + + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; +}; + +static uint32_t +_spdk_blob_md_page_calc_crc(void *page) +{ + uint32_t crc; + + crc = BLOB_CRC32C_INITIAL; + crc = spdk_crc32c_update(page, SPDK_BS_PAGE_SIZE - 4, crc); + crc ^= BLOB_CRC32C_INITIAL; + + 
return crc; + +} + +static void +_spdk_blob_load_final(void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + _spdk_blob_mark_clean(blob); + + ctx->cb_fn(ctx->seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_dma_free(ctx->pages); + free(ctx); +} + +static void +_spdk_blob_load_snapshot_cpl(void *cb_arg, struct spdk_blob *snapshot, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno != 0) { + goto error; + } + + blob->back_bs_dev = spdk_bs_create_blob_bs_dev(snapshot); + + if (blob->back_bs_dev == NULL) { + bserrno = -ENOMEM; + goto error; + } + + _spdk_blob_load_final(ctx, bserrno); + return; + +error: + SPDK_ERRLOG("Snapshot fail\n"); + _spdk_blob_free(blob); + ctx->cb_fn(ctx->seq, NULL, bserrno); + spdk_dma_free(ctx->pages); + free(ctx); +} + +static void +_spdk_blob_load_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_load_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_md_page *page; + const void *value; + size_t len; + int rc; + uint32_t crc; + + page = &ctx->pages[ctx->num_pages - 1]; + crc = _spdk_blob_md_page_calc_crc(page); + if (crc != page->crc) { + SPDK_ERRLOG("Metadata page %d crc mismatch\n", ctx->num_pages); + _spdk_blob_free(blob); + ctx->cb_fn(seq, NULL, -EINVAL); + spdk_dma_free(ctx->pages); + free(ctx); + return; + } + + if (page->next != SPDK_INVALID_MD_PAGE) { + uint32_t next_page = page->next; + uint64_t next_lba = _spdk_bs_page_to_lba(blob->bs, blob->bs->md_start + next_page); + + + assert(next_lba < (blob->bs->md_start + blob->bs->md_len)); + + /* Read the next page */ + ctx->num_pages++; + ctx->pages = spdk_dma_realloc(ctx->pages, (sizeof(*page) * ctx->num_pages), + sizeof(*page), NULL); + if (ctx->pages == NULL) { + ctx->cb_fn(seq, ctx->cb_arg, -ENOMEM); + free(ctx); + return; + } + + spdk_bs_sequence_read_dev(seq, &ctx->pages[ctx->num_pages - 1], + next_lba, + _spdk_bs_byte_to_lba(blob->bs, sizeof(*page)), + _spdk_blob_load_cpl, ctx); + return; + } + + /* Parse the pages */ + rc = _spdk_blob_parse(ctx->pages, ctx->num_pages, blob); + if (rc) { + _spdk_blob_free(blob); + ctx->cb_fn(seq, NULL, rc); + spdk_dma_free(ctx->pages); + free(ctx); + return; + } + ctx->seq = seq; + + + if (spdk_blob_is_thin_provisioned(blob)) { + rc = _spdk_blob_get_xattr_value(blob, BLOB_SNAPSHOT, &value, &len, true); + if (rc == 0) { + if (len != sizeof(spdk_blob_id)) { + _spdk_blob_free(blob); + ctx->cb_fn(seq, NULL, -EINVAL); + spdk_dma_free(ctx->pages); + free(ctx); + return; + } + /* open snapshot blob and continue in the callback function */ + blob->parent_id = *(spdk_blob_id *)value; + spdk_bs_open_blob(blob->bs, blob->parent_id, + _spdk_blob_load_snapshot_cpl, ctx); + return; + } else { + /* add zeroes_dev for thin provisioned blob */ + blob->back_bs_dev = spdk_bs_create_zeroes_dev(); + } + } else { + /* standard blob */ + blob->back_bs_dev = NULL; + } + _spdk_blob_load_final(ctx, bserrno); +} + +/* Load a blob from disk given a blobid */ +static void +_spdk_blob_load(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_load_ctx *ctx; + struct spdk_blob_store *bs; + uint32_t page_num; + uint64_t lba; + + _spdk_blob_verify_md_op(blob); + + bs = blob->bs; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->pages = spdk_dma_realloc(ctx->pages, SPDK_BS_PAGE_SIZE, + 
SPDK_BS_PAGE_SIZE, NULL); + if (!ctx->pages) { + free(ctx); + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->num_pages = 1; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + page_num = _spdk_bs_blobid_to_page(blob->id); + lba = _spdk_bs_page_to_lba(blob->bs, bs->md_start + page_num); + + blob->state = SPDK_BLOB_STATE_LOADING; + + spdk_bs_sequence_read_dev(seq, &ctx->pages[0], lba, + _spdk_bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE), + _spdk_blob_load_cpl, ctx); +} + +struct spdk_blob_persist_ctx { + struct spdk_blob *blob; + + struct spdk_bs_super_block *super; + + struct spdk_blob_md_page *pages; + + uint64_t idx; + + spdk_bs_sequence_t *seq; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; +}; + +static void +_spdk_blob_persist_complete(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + + if (bserrno == 0) { + _spdk_blob_mark_clean(blob); + } + + /* Call user callback */ + ctx->cb_fn(seq, ctx->cb_arg, bserrno); + + /* Free the memory */ + spdk_dma_free(ctx->pages); + free(ctx); +} + +static void +_spdk_blob_persist_unmap_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + void *tmp; + size_t i; + + /* Release all clusters that were truncated */ + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint32_t cluster_num = _spdk_bs_lba_to_cluster(bs, blob->active.clusters[i]); + + /* Nothing to release if it was not allocated */ + if (blob->active.clusters[i] != 0) { + _spdk_bs_release_cluster(bs, cluster_num); + } + } + + if (blob->active.num_clusters == 0) { + free(blob->active.clusters); + blob->active.clusters = NULL; + blob->active.cluster_array_size = 0; + } else { + tmp = realloc(blob->active.clusters, sizeof(uint64_t) * blob->active.num_clusters); + assert(tmp != NULL); + blob->active.clusters = tmp; + blob->active.cluster_array_size = blob->active.num_clusters; + } + + _spdk_blob_persist_complete(seq, ctx, bserrno); +} + +static void +_spdk_blob_persist_unmap_clusters(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + spdk_bs_batch_t *batch; + size_t i; + uint64_t lba; + uint32_t lba_count; + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_unmap_clusters_cpl, ctx); + + /* Unmap all clusters that were truncated */ + lba = 0; + lba_count = 0; + for (i = blob->active.num_clusters; i < blob->active.cluster_array_size; i++) { + uint64_t next_lba = blob->active.clusters[i]; + uint32_t next_lba_count = _spdk_bs_cluster_to_lba(bs, 1); + + if (next_lba > 0 && (lba + lba_count) == next_lba) { + /* This cluster is contiguous with the previous one. */ + lba_count += next_lba_count; + continue; + } + + /* This cluster is not contiguous with the previous one. */ + + /* If a run of LBAs previously existing, send them + * as an unmap. 
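+		 * Runs are only built from allocated clusters; an unallocated cluster
+		 * (LBA 0) ends the current run without itself being unmapped.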
+ */ + if (lba_count > 0) { + spdk_bs_batch_unmap_dev(batch, lba, lba_count); + } + + /* Start building the next batch */ + lba = next_lba; + if (next_lba > 0) { + lba_count = next_lba_count; + } else { + lba_count = 0; + } + } + + /* If we ended with a contiguous set of LBAs, send the unmap now */ + if (lba_count > 0) { + spdk_bs_batch_unmap_dev(batch, lba, lba_count); + } + + spdk_bs_batch_close(batch); +} + +static void +_spdk_blob_persist_zero_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + size_t i; + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. + */ + for (i = 1; i < blob->clean.num_pages; i++) { + spdk_bit_array_clear(bs->used_md_pages, blob->clean.pages[i]); + } + + if (blob->active.num_pages == 0) { + uint32_t page_num; + + page_num = _spdk_bs_blobid_to_page(blob->id); + spdk_bit_array_clear(bs->used_md_pages, page_num); + } + + /* Move on to unmapping clusters */ + _spdk_blob_persist_unmap_clusters(seq, ctx, 0); +} + +static void +_spdk_blob_persist_zero_pages(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + spdk_bs_batch_t *batch; + size_t i; + + batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_zero_pages_cpl, ctx); + + lba_count = _spdk_bs_byte_to_lba(bs, SPDK_BS_PAGE_SIZE); + + /* This loop starts at 1 because the first page is special and handled + * below. The pages (except the first) are never written in place, + * so any pages in the clean list must be zeroed. + */ + for (i = 1; i < blob->clean.num_pages; i++) { + lba = _spdk_bs_page_to_lba(bs, bs->md_start + blob->clean.pages[i]); + + spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + /* The first page will only be zeroed if this is a delete. 
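+	 * A persist of a live blob instead rewrites the first page in place via
+	 * _spdk_blob_persist_write_page_root().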
*/ + if (blob->active.num_pages == 0) { + uint32_t page_num; + + /* The first page in the metadata goes where the blobid indicates */ + page_num = _spdk_bs_blobid_to_page(blob->id); + lba = _spdk_bs_page_to_lba(bs, bs->md_start + page_num); + + spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + spdk_bs_batch_close(batch); +} + +static void +_spdk_blob_persist_write_page_root(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + + if (blob->active.num_pages == 0) { + /* Move on to the next step */ + _spdk_blob_persist_zero_pages(seq, ctx, 0); + return; + } + + lba_count = _spdk_bs_byte_to_lba(bs, sizeof(*page)); + + page = &ctx->pages[0]; + /* The first page in the metadata goes where the blobid indicates */ + lba = _spdk_bs_page_to_lba(bs, bs->md_start + _spdk_bs_blobid_to_page(blob->id)); + + spdk_bs_sequence_write_dev(seq, page, lba, lba_count, + _spdk_blob_persist_zero_pages, ctx); +} + +static void +_spdk_blob_persist_write_page_chain(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t lba; + uint32_t lba_count; + struct spdk_blob_md_page *page; + spdk_bs_batch_t *batch; + size_t i; + + /* Clusters don't move around in blobs. The list shrinks or grows + * at the end, but no changes ever occur in the middle of the list. + */ + + lba_count = _spdk_bs_byte_to_lba(bs, sizeof(*page)); + + batch = spdk_bs_sequence_to_batch(seq, _spdk_blob_persist_write_page_root, ctx); + + /* This starts at 1. The root page is not written until + * all of the others are finished + */ + for (i = 1; i < blob->active.num_pages; i++) { + page = &ctx->pages[i]; + assert(page->sequence_num == i); + + lba = _spdk_bs_page_to_lba(bs, bs->md_start + blob->active.pages[i]); + + spdk_bs_batch_write_dev(batch, page, lba, lba_count); + } + + spdk_bs_batch_close(batch); +} + +static int +_spdk_blob_resize(struct spdk_blob *blob, uint64_t sz) +{ + uint64_t i; + uint64_t *tmp; + uint64_t lfc; /* lowest free cluster */ + uint64_t num_clusters; + struct spdk_blob_store *bs; + + bs = blob->bs; + + _spdk_blob_verify_md_op(blob); + + if (blob->active.num_clusters == sz) { + return 0; + } + + if (blob->active.num_clusters < blob->active.cluster_array_size) { + /* If this blob was resized to be larger, then smaller, then + * larger without syncing, then the cluster array already + * contains spare assigned clusters we can use. + */ + num_clusters = spdk_min(blob->active.cluster_array_size, + sz); + } else { + num_clusters = blob->active.num_clusters; + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and another to actually claim them. + */ + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + for (i = num_clusters; i < sz; i++) { + lfc = spdk_bit_array_find_first_clear(bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. Cannot satisfy the request */ + return -ENOSPC; + } + lfc++; + } + } + + if (sz > num_clusters) { + /* Expand the cluster array if necessary. + * We only shrink the array when persisting. 
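+		 * The newly grown tail of the array is zeroed below so that the added
+		 * clusters start out unallocated (LBA 0).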
+ */ + tmp = realloc(blob->active.clusters, sizeof(uint64_t) * sz); + if (sz > 0 && tmp == NULL) { + return -ENOMEM; + } + memset(tmp + blob->active.cluster_array_size, 0, + sizeof(uint64_t) * (sz - blob->active.cluster_array_size)); + blob->active.clusters = tmp; + blob->active.cluster_array_size = sz; + } + + blob->state = SPDK_BLOB_STATE_DIRTY; + + if (spdk_blob_is_thin_provisioned(blob) == false) { + lfc = 0; + for (i = num_clusters; i < sz; i++) { + _spdk_bs_allocate_cluster(blob, i, &lfc, true); + lfc++; + } + } + + blob->active.num_clusters = sz; + + return 0; +} + +static void +_spdk_blob_persist_start(struct spdk_blob_persist_ctx *ctx) +{ + spdk_bs_sequence_t *seq = ctx->seq; + struct spdk_blob *blob = ctx->blob; + struct spdk_blob_store *bs = blob->bs; + uint64_t i; + uint32_t page_num; + void *tmp; + int rc; + + if (blob->active.num_pages == 0) { + /* This is the signal that the blob should be deleted. + * Immediately jump to the clean up routine. */ + assert(blob->clean.num_pages > 0); + ctx->idx = blob->clean.num_pages - 1; + blob->state = SPDK_BLOB_STATE_CLEAN; + _spdk_blob_persist_zero_pages(seq, ctx, 0); + return; + + } + + /* Generate the new metadata */ + rc = _spdk_blob_serialize(blob, &ctx->pages, &blob->active.num_pages); + if (rc < 0) { + _spdk_blob_persist_complete(seq, ctx, rc); + return; + } + + assert(blob->active.num_pages >= 1); + + /* Resize the cache of page indices */ + tmp = realloc(blob->active.pages, blob->active.num_pages * sizeof(*blob->active.pages)); + if (!tmp) { + _spdk_blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + blob->active.pages = tmp; + + /* Assign this metadata to pages. This requires two passes - + * one to verify that there are enough pages and a second + * to actually claim them. */ + page_num = 0; + /* Note that this loop starts at one. The first page location is fixed by the blobid. */ + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + if (page_num == UINT32_MAX) { + _spdk_blob_persist_complete(seq, ctx, -ENOMEM); + return; + } + page_num++; + } + + page_num = 0; + blob->active.pages[0] = _spdk_bs_blobid_to_page(blob->id); + for (i = 1; i < blob->active.num_pages; i++) { + page_num = spdk_bit_array_find_first_clear(bs->used_md_pages, page_num); + ctx->pages[i - 1].next = page_num; + /* Now that previous metadata page is complete, calculate the crc for it. 
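+		 * The crc is computed over the first SPDK_BS_PAGE_SIZE - 4 bytes of the
+		 * page (see _spdk_blob_md_page_calc_crc()).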
*/ + ctx->pages[i - 1].crc = _spdk_blob_md_page_calc_crc(&ctx->pages[i - 1]); + blob->active.pages[i] = page_num; + spdk_bit_array_set(bs->used_md_pages, page_num); + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Claiming page %u for blob %lu\n", page_num, blob->id); + page_num++; + } + ctx->pages[i - 1].crc = _spdk_blob_md_page_calc_crc(&ctx->pages[i - 1]); + /* Start writing the metadata from last page to first */ + ctx->idx = blob->active.num_pages - 1; + blob->state = SPDK_BLOB_STATE_CLEAN; + _spdk_blob_persist_write_page_chain(seq, ctx, 0); +} + +static void +_spdk_blob_persist_dirty_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + ctx->blob->bs->clean = 0; + + spdk_dma_free(ctx->super); + + _spdk_blob_persist_start(ctx); +} + +static void +_spdk_bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg); + + +static void +_spdk_blob_persist_dirty(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_persist_ctx *ctx = cb_arg; + + ctx->super->clean = 0; + if (ctx->super->size == 0) { + ctx->super->size = ctx->blob->bs->dev->blockcnt * ctx->blob->bs->dev->blocklen; + } + + _spdk_bs_write_super(seq, ctx->blob->bs, ctx->super, _spdk_blob_persist_dirty_cpl, ctx); +} + + +/* Write a blob to disk */ +static void +_spdk_blob_persist(spdk_bs_sequence_t *seq, struct spdk_blob *blob, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_blob_persist_ctx *ctx; + + _spdk_blob_verify_md_op(blob); + + if (blob->state == SPDK_BLOB_STATE_CLEAN) { + cb_fn(seq, cb_arg, 0); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(seq, cb_arg, -ENOMEM); + return; + } + ctx->blob = blob; + ctx->seq = seq; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + if (blob->bs->clean) { + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + cb_fn(seq, cb_arg, -ENOMEM); + free(ctx); + return; + } + + spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(blob->bs, 0), + _spdk_bs_byte_to_lba(blob->bs, sizeof(*ctx->super)), + _spdk_blob_persist_dirty, ctx); + } else { + _spdk_blob_persist_start(ctx); + } +} + +struct spdk_blob_copy_cluster_ctx { + struct spdk_blob *blob; + uint8_t *buf; + uint64_t page; + uint64_t new_cluster; + spdk_bs_sequence_t *seq; +}; + +static void +_spdk_blob_allocate_and_copy_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)ctx->seq; + TAILQ_HEAD(, spdk_bs_request_set) requests; + spdk_bs_user_op_t *op; + + TAILQ_INIT(&requests); + TAILQ_SWAP(&set->channel->need_cluster_alloc, &requests, spdk_bs_request_set, link); + + while (!TAILQ_EMPTY(&requests)) { + op = TAILQ_FIRST(&requests); + TAILQ_REMOVE(&requests, op, link); + if (bserrno == 0) { + spdk_bs_user_op_execute(op); + } else { + spdk_bs_user_op_abort(op); + } + } + + spdk_dma_free(ctx->buf); + free(ctx); +} + +static void +_spdk_blob_insert_cluster_cpl(void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno) { + uint32_t cluster_number; + + if (bserrno == -EEXIST) { + /* The metadata insert failed because another thread + * allocated the cluster first. Free our cluster + * but continue without error. 
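+			 * The cluster inserted by the other thread already backs this
+			 * range, so the queued user operations can still execute.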
*/ + bserrno = 0; + } + + cluster_number = _spdk_bs_page_to_cluster(ctx->blob->bs, ctx->page); + _spdk_bs_release_cluster(ctx->blob->bs, cluster_number); + } + + spdk_bs_sequence_finish(ctx->seq, bserrno); +} + +static void +_spdk_blob_write_copy_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + uint32_t cluster_number; + + if (bserrno) { + /* The write failed, so jump to the final completion handler */ + spdk_bs_sequence_finish(seq, bserrno); + return; + } + + cluster_number = _spdk_bs_page_to_cluster(ctx->blob->bs, ctx->page); + + _spdk_blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + _spdk_blob_insert_cluster_cpl, ctx); +} + +static void +_spdk_blob_write_copy(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob_copy_cluster_ctx *ctx = cb_arg; + + if (bserrno != 0) { + /* The read failed, so jump to the final completion handler */ + spdk_bs_sequence_finish(seq, bserrno); + return; + } + + /* Write whole cluster */ + spdk_bs_sequence_write_dev(seq, ctx->buf, + _spdk_bs_cluster_to_lba(ctx->blob->bs, ctx->new_cluster), + _spdk_bs_cluster_to_lba(ctx->blob->bs, 1), + _spdk_blob_write_copy_cpl, ctx); +} + +static void +_spdk_bs_allocate_and_copy_cluster(struct spdk_blob *blob, + struct spdk_io_channel *_ch, + uint64_t io_unit, spdk_bs_user_op_t *op) +{ + struct spdk_bs_cpl cpl; + struct spdk_bs_channel *ch; + struct spdk_blob_copy_cluster_ctx *ctx; + uint32_t cluster_start_page; + uint32_t cluster_number; + int rc; + + ch = spdk_io_channel_get_ctx(_ch); + + if (!TAILQ_EMPTY(&ch->need_cluster_alloc)) { + /* There are already operations pending. Queue this user op + * and return because it will be re-executed when the outstanding + * cluster allocation completes. */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + return; + } + + /* Round the io_unit offset down to the first page in the cluster */ + cluster_start_page = _spdk_bs_io_unit_to_cluster_start(blob, io_unit); + + /* Calculate which index in the metadata cluster array the corresponding + * cluster is supposed to be at. 
*/ + cluster_number = _spdk_bs_io_unit_to_cluster_number(blob, io_unit); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_bs_user_op_abort(op); + return; + } + + assert(blob->bs->cluster_sz % blob->back_bs_dev->blocklen == 0); + + ctx->blob = blob; + ctx->page = cluster_start_page; + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + ctx->buf = spdk_dma_malloc(blob->bs->cluster_sz, blob->back_bs_dev->blocklen, NULL); + if (!ctx->buf) { + SPDK_ERRLOG("DMA allocation for cluster of size = %" PRIu32 " failed.\n", + blob->bs->cluster_sz); + free(ctx); + spdk_bs_user_op_abort(op); + return; + } + } + + rc = _spdk_bs_allocate_cluster(blob, cluster_number, &ctx->new_cluster, false); + if (rc != 0) { + spdk_dma_free(ctx->buf); + free(ctx); + spdk_bs_user_op_abort(op); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = _spdk_blob_allocate_and_copy_cluster_cpl; + cpl.u.blob_basic.cb_arg = ctx; + + ctx->seq = spdk_bs_sequence_start(_ch, &cpl); + if (!ctx->seq) { + _spdk_bs_release_cluster(blob->bs, ctx->new_cluster); + spdk_dma_free(ctx->buf); + free(ctx); + spdk_bs_user_op_abort(op); + return; + } + + /* Queue the user op to block other incoming operations */ + TAILQ_INSERT_TAIL(&ch->need_cluster_alloc, op, link); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + /* Read cluster from backing device */ + spdk_bs_sequence_read_bs_dev(ctx->seq, blob->back_bs_dev, ctx->buf, + _spdk_bs_dev_page_to_lba(blob->back_bs_dev, cluster_start_page), + _spdk_bs_dev_byte_to_lba(blob->back_bs_dev, blob->bs->cluster_sz), + _spdk_blob_write_copy, ctx); + } else { + _spdk_blob_insert_cluster_on_md_thread(ctx->blob, cluster_number, ctx->new_cluster, + _spdk_blob_insert_cluster_cpl, ctx); + } +} + +static void +_spdk_blob_calculate_lba_and_lba_count(struct spdk_blob *blob, uint64_t io_unit, uint64_t length, + uint64_t *lba, uint32_t *lba_count) +{ + *lba_count = length; + + if (!_spdk_bs_io_unit_is_allocated(blob, io_unit)) { + assert(blob->back_bs_dev != NULL); + *lba = _spdk_bs_io_unit_to_back_dev_lba(blob, io_unit); + *lba_count = _spdk_bs_io_unit_to_back_dev_lba(blob, *lba_count); + } else { + *lba = _spdk_bs_blob_io_unit_to_lba(blob, io_unit); + } +} + +struct op_split_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + void *curr_payload; + enum spdk_blob_op_type op_type; + spdk_bs_sequence_t *seq; +}; + +static void +_spdk_blob_request_submit_op_split_next(void *cb_arg, int bserrno) +{ + struct op_split_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct spdk_io_channel *ch = ctx->channel; + enum spdk_blob_op_type op_type = ctx->op_type; + uint8_t *buf = ctx->curr_payload; + uint64_t offset = ctx->io_unit_offset; + uint64_t length = ctx->io_units_remaining; + uint64_t op_length; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + spdk_bs_sequence_finish(ctx->seq, bserrno); + free(ctx); + return; + } + + op_length = spdk_min(length, _spdk_bs_num_io_units_to_cluster_boundary(blob, + offset)); + + /* Update length and payload for next operation */ + ctx->io_units_remaining -= op_length; + ctx->io_unit_offset += op_length; + if (op_type == SPDK_BLOB_WRITE || op_type == SPDK_BLOB_READ) { + ctx->curr_payload += op_length * blob->bs->io_unit_size; + } + + switch (op_type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(blob, ch, buf, offset, op_length, + _spdk_blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(blob, ch, buf, offset, op_length, + 
_spdk_blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(blob, ch, offset, op_length, + _spdk_blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(blob, ch, offset, op_length, + _spdk_blob_request_submit_op_split_next, ctx); + break; + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid for %s\n", __func__); + spdk_bs_sequence_finish(ctx->seq, -EINVAL); + free(ctx); + break; + } +} + +static void +_spdk_blob_request_submit_op_split(struct spdk_io_channel *ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct op_split_ctx *ctx; + spdk_bs_sequence_t *seq; + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(struct op_split_ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(ch, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = ch; + ctx->curr_payload = payload; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->op_type = op_type; + ctx->seq = seq; + + _spdk_blob_request_submit_op_split_next(ctx, 0); +} + +static void +_spdk_blob_request_submit_op_single(struct spdk_io_channel *_ch, struct spdk_blob *blob, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + struct spdk_bs_cpl cpl; + uint64_t lba; + uint32_t lba_count; + + assert(blob != NULL); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + _spdk_blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_ch); + + op = spdk_bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + switch (op_type) { + case SPDK_BLOB_READ: { + spdk_bs_batch_t *batch; + + batch = spdk_bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (_spdk_bs_io_unit_is_allocated(blob, offset)) { + /* Read from the blob */ + spdk_bs_batch_read_dev(batch, payload, lba, lba_count); + } else { + /* Read from the backing block device */ + spdk_bs_batch_read_bs_dev(batch, blob->back_bs_dev, payload, lba, lba_count); + } + + spdk_bs_batch_close(batch); + break; + } + case SPDK_BLOB_WRITE: + case SPDK_BLOB_WRITE_ZEROES: { + if (_spdk_bs_io_unit_is_allocated(blob, offset)) { + /* Write to the blob */ + spdk_bs_batch_t *batch; + + if (lba_count == 0) { + cb_fn(cb_arg, 0); + return; + } + + batch = spdk_bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (op_type == SPDK_BLOB_WRITE) { + spdk_bs_batch_write_dev(batch, payload, lba, lba_count); + } else { + spdk_bs_batch_write_zeroes_dev(batch, lba, lba_count); + } + + spdk_bs_batch_close(batch); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = spdk_bs_user_op_alloc(_ch, &cpl, op_type, blob, payload, 0, offset, length); + if (!op) { + cb_fn(cb_arg, 
-ENOMEM); + return; + } + + _spdk_bs_allocate_and_copy_cluster(blob, _ch, offset, op); + } + break; + } + case SPDK_BLOB_UNMAP: { + spdk_bs_batch_t *batch; + + batch = spdk_bs_batch_open(_ch, &cpl); + if (!batch) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (_spdk_bs_io_unit_is_allocated(blob, offset)) { + spdk_bs_batch_unmap_dev(batch, lba, lba_count); + } + + spdk_bs_batch_close(batch); + break; + } + case SPDK_BLOB_READV: + case SPDK_BLOB_WRITEV: + SPDK_ERRLOG("readv/write not valid\n"); + cb_fn(cb_arg, -EINVAL); + break; + } +} + +static void +_spdk_blob_request_submit_op(struct spdk_blob *blob, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, enum spdk_blob_op_type op_type) +{ + assert(blob != NULL); + + if (blob->data_ro && op_type != SPDK_BLOB_READ) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (offset + length > _spdk_bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + if (length <= _spdk_bs_num_io_units_to_cluster_boundary(blob, offset)) { + _spdk_blob_request_submit_op_single(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } else { + _spdk_blob_request_submit_op_split(_channel, blob, payload, offset, length, + cb_fn, cb_arg, op_type); + } +} + +struct rw_iov_ctx { + struct spdk_blob *blob; + struct spdk_io_channel *channel; + spdk_blob_op_complete cb_fn; + void *cb_arg; + bool read; + int iovcnt; + struct iovec *orig_iov; + uint64_t io_unit_offset; + uint64_t io_units_remaining; + uint64_t io_units_done; + struct iovec iov[0]; +}; + +static void +_spdk_rw_iov_done(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + assert(cb_arg == NULL); + spdk_bs_sequence_finish(seq, bserrno); +} + +static void +_spdk_rw_iov_split_next(void *cb_arg, int bserrno) +{ + struct rw_iov_ctx *ctx = cb_arg; + struct spdk_blob *blob = ctx->blob; + struct iovec *iov, *orig_iov; + int iovcnt; + size_t orig_iovoff; + uint64_t io_units_count, io_units_to_boundary, io_unit_offset; + uint64_t byte_count; + + if (bserrno != 0 || ctx->io_units_remaining == 0) { + ctx->cb_fn(ctx->cb_arg, bserrno); + free(ctx); + return; + } + + io_unit_offset = ctx->io_unit_offset; + io_units_to_boundary = _spdk_bs_num_io_units_to_cluster_boundary(blob, io_unit_offset); + io_units_count = spdk_min(ctx->io_units_remaining, io_units_to_boundary); + /* + * Get index and offset into the original iov array for our current position in the I/O sequence. + * byte_count will keep track of how many bytes remaining until orig_iov and orig_iovoff will + * point to the current position in the I/O sequence. + */ + byte_count = ctx->io_units_done * blob->bs->io_unit_size; + orig_iov = &ctx->orig_iov[0]; + orig_iovoff = 0; + while (byte_count > 0) { + if (byte_count >= orig_iov->iov_len) { + byte_count -= orig_iov->iov_len; + orig_iov++; + } else { + orig_iovoff = byte_count; + byte_count = 0; + } + } + + /* + * Build an iov array for the next I/O in the sequence. byte_count will keep track of how many + * bytes of this next I/O remain to be accounted for in the new iov array. 
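+	 * A sub-I/O never covers more of the original buffers than the full request,
+	 * so the iov array sized from the caller's iovcnt is large enough (asserted below).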
+ */ + byte_count = io_units_count * blob->bs->io_unit_size; + iov = &ctx->iov[0]; + iovcnt = 0; + while (byte_count > 0) { + assert(iovcnt < ctx->iovcnt); + iov->iov_len = spdk_min(byte_count, orig_iov->iov_len - orig_iovoff); + iov->iov_base = orig_iov->iov_base + orig_iovoff; + byte_count -= iov->iov_len; + orig_iovoff = 0; + orig_iov++; + iov++; + iovcnt++; + } + + ctx->io_unit_offset += io_units_count; + ctx->io_units_remaining -= io_units_count; + ctx->io_units_done += io_units_count; + iov = &ctx->iov[0]; + + if (ctx->read) { + spdk_blob_io_readv(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, _spdk_rw_iov_split_next, ctx); + } else { + spdk_blob_io_writev(ctx->blob, ctx->channel, iov, iovcnt, io_unit_offset, + io_units_count, _spdk_rw_iov_split_next, ctx); + } +} + +static void +_spdk_blob_request_submit_rw_iov(struct spdk_blob *blob, struct spdk_io_channel *_channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg, bool read) +{ + struct spdk_bs_cpl cpl; + + assert(blob != NULL); + + if (!read && blob->data_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (length == 0) { + cb_fn(cb_arg, 0); + return; + } + + if (offset + length > _spdk_bs_cluster_to_lba(blob->bs, blob->active.num_clusters)) { + cb_fn(cb_arg, -EINVAL); + return; + } + + /* + * For now, we implement readv/writev using a sequence (instead of a batch) to account for having + * to split a request that spans a cluster boundary. For I/O that do not span a cluster boundary, + * there will be no noticeable difference compared to using a batch. For I/O that do span a cluster + * boundary, the target LBAs (after blob offset to LBA translation) may not be contiguous, so we need + * to allocate a separate iov array and split the I/O such that none of the resulting + * smaller I/O cross a cluster boundary. These smaller I/O will be issued in sequence (not in parallel) + * but since this case happens very infrequently, any performance impact will be negligible. + * + * This could be optimized in the future to allocate a big enough iov array to account for all of the iovs + * for all of the smaller I/Os, pre-build all of the iov arrays for the smaller I/Os, then issue them + * in a batch. That would also require creating an intermediate spdk_bs_cpl that would get called + * when the batch was completed, to allow for freeing the memory for the iov arrays. 
+ */ + if (spdk_likely(length <= _spdk_bs_num_io_units_to_cluster_boundary(blob, offset))) { + uint32_t lba_count; + uint64_t lba; + + _spdk_blob_calculate_lba_and_lba_count(blob, offset, length, &lba, &lba_count); + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + if (blob->frozen_refcnt) { + /* This blob I/O is frozen */ + spdk_bs_user_op_t *op; + struct spdk_bs_channel *bs_channel = spdk_io_channel_get_ctx(_channel); + + op = spdk_bs_user_op_alloc(_channel, &cpl, read, blob, iov, iovcnt, offset, length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + TAILQ_INSERT_TAIL(&bs_channel->queued_io, op, link); + + return; + } + + if (read) { + spdk_bs_sequence_t *seq; + + seq = spdk_bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + if (_spdk_bs_io_unit_is_allocated(blob, offset)) { + spdk_bs_sequence_readv_dev(seq, iov, iovcnt, lba, lba_count, _spdk_rw_iov_done, NULL); + } else { + spdk_bs_sequence_readv_bs_dev(seq, blob->back_bs_dev, iov, iovcnt, lba, lba_count, + _spdk_rw_iov_done, NULL); + } + } else { + if (_spdk_bs_io_unit_is_allocated(blob, offset)) { + spdk_bs_sequence_t *seq; + + seq = spdk_bs_sequence_start(_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_sequence_writev_dev(seq, iov, iovcnt, lba, lba_count, _spdk_rw_iov_done, NULL); + } else { + /* Queue this operation and allocate the cluster */ + spdk_bs_user_op_t *op; + + op = spdk_bs_user_op_alloc(_channel, &cpl, SPDK_BLOB_WRITEV, blob, iov, iovcnt, offset, + length); + if (!op) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + _spdk_bs_allocate_and_copy_cluster(blob, _channel, offset, op); + } + } + } else { + struct rw_iov_ctx *ctx; + + ctx = calloc(1, sizeof(struct rw_iov_ctx) + iovcnt * sizeof(struct iovec)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->blob = blob; + ctx->channel = _channel; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->read = read; + ctx->orig_iov = iov; + ctx->iovcnt = iovcnt; + ctx->io_unit_offset = offset; + ctx->io_units_remaining = length; + ctx->io_units_done = 0; + + _spdk_rw_iov_split_next(ctx, 0); + } +} + +static struct spdk_blob * +_spdk_blob_lookup(struct spdk_blob_store *bs, spdk_blob_id blobid) +{ + struct spdk_blob *blob; + + TAILQ_FOREACH(blob, &bs->blobs, link) { + if (blob->id == blobid) { + return blob; + } + } + + return NULL; +} + +static int +_spdk_bs_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_bs_channel *channel = ctx_buf; + struct spdk_bs_dev *dev; + uint32_t max_ops = bs->max_channel_ops; + uint32_t i; + + dev = bs->dev; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_bs_request_set)); + if (!channel->req_mem) { + return -1; + } + + TAILQ_INIT(&channel->reqs); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->bs = bs; + channel->dev = dev; + channel->dev_channel = dev->create_channel(dev); + + if (!channel->dev_channel) { + SPDK_ERRLOG("Failed to create device channel.\n"); + free(channel->req_mem); + return -1; + } + + TAILQ_INIT(&channel->need_cluster_alloc); + TAILQ_INIT(&channel->queued_io); + + return 0; +} + +static void +_spdk_bs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_bs_channel *channel = ctx_buf; + spdk_bs_user_op_t *op; + + while (!TAILQ_EMPTY(&channel->need_cluster_alloc)) { + op = TAILQ_FIRST(&channel->need_cluster_alloc); + 
TAILQ_REMOVE(&channel->need_cluster_alloc, op, link); + spdk_bs_user_op_abort(op); + } + + while (!TAILQ_EMPTY(&channel->queued_io)) { + op = TAILQ_FIRST(&channel->queued_io); + TAILQ_REMOVE(&channel->queued_io, op, link); + spdk_bs_user_op_abort(op); + } + + free(channel->req_mem); + channel->dev->destroy_channel(channel->dev, channel->dev_channel); +} + +static void +_spdk_bs_dev_destroy(void *io_device) +{ + struct spdk_blob_store *bs = io_device; + struct spdk_blob *blob, *blob_tmp; + + bs->dev->destroy(bs->dev); + + TAILQ_FOREACH_SAFE(blob, &bs->blobs, link, blob_tmp) { + TAILQ_REMOVE(&bs->blobs, blob, link); + _spdk_blob_free(blob); + } + + pthread_mutex_destroy(&bs->used_clusters_mutex); + + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + /* + * If this function is called for any reason except a successful unload, + * the unload_cpl type will be NONE and this will be a nop. + */ + spdk_bs_call_cpl(&bs->unload_cpl, bs->unload_err); + + free(bs); +} + +static int +_spdk_bs_blob_list_add(struct spdk_blob *blob) +{ + spdk_blob_id snapshot_id; + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + assert(blob != NULL); + + snapshot_id = blob->parent_id; + if (snapshot_id == SPDK_BLOBID_INVALID) { + return 0; + } + + TAILQ_FOREACH(snapshot_entry, &blob->bs->snapshots, link) { + if (snapshot_entry->id == snapshot_id) { + break; + } + } + + if (snapshot_entry == NULL) { + /* Snapshot not found */ + snapshot_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (snapshot_entry == NULL) { + return -ENOMEM; + } + snapshot_entry->id = snapshot_id; + TAILQ_INIT(&snapshot_entry->clones); + TAILQ_INSERT_TAIL(&blob->bs->snapshots, snapshot_entry, link); + } else { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob->id) { + break; + } + } + } + + if (clone_entry == NULL) { + /* Clone not found */ + clone_entry = calloc(1, sizeof(struct spdk_blob_list)); + if (clone_entry == NULL) { + return -ENOMEM; + } + clone_entry->id = blob->id; + TAILQ_INIT(&clone_entry->clones); + TAILQ_INSERT_TAIL(&snapshot_entry->clones, clone_entry, link); + snapshot_entry->clone_count++; + } + + return 0; +} + +static int +_spdk_bs_blob_list_remove(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + spdk_blob_id snapshot_id; + + assert(blob != NULL); + + snapshot_id = blob->parent_id; + if (snapshot_id == SPDK_BLOBID_INVALID) { + return 0; + } + + TAILQ_FOREACH(snapshot_entry, &blob->bs->snapshots, link) { + if (snapshot_entry->id == snapshot_id) { + break; + } + } + + assert(snapshot_entry != NULL); + + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob->id) { + break; + } + } + + assert(clone_entry != NULL); + + blob->parent_id = SPDK_BLOBID_INVALID; + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + + snapshot_entry->clone_count--; + if (snapshot_entry->clone_count == 0) { + /* Snapshot have no more clones */ + TAILQ_REMOVE(&blob->bs->snapshots, snapshot_entry, link); + free(snapshot_entry); + } + + return 0; +} + +static int +_spdk_bs_blob_list_free(struct spdk_blob_store *bs) +{ + struct spdk_blob_list *snapshot_entry; + struct spdk_blob_list *snapshot_entry_tmp; + struct spdk_blob_list *clone_entry; + struct spdk_blob_list *clone_entry_tmp; + + TAILQ_FOREACH_SAFE(snapshot_entry, &bs->snapshots, link, 
snapshot_entry_tmp) { + TAILQ_FOREACH_SAFE(clone_entry, &snapshot_entry->clones, link, clone_entry_tmp) { + TAILQ_REMOVE(&snapshot_entry->clones, clone_entry, link); + free(clone_entry); + } + TAILQ_REMOVE(&bs->snapshots, snapshot_entry, link); + free(snapshot_entry); + } + + return 0; +} + +static void +_spdk_bs_free(struct spdk_blob_store *bs) +{ + _spdk_bs_blob_list_free(bs); + + spdk_bs_unregister_md_thread(bs); + spdk_io_device_unregister(bs, _spdk_bs_dev_destroy); +} + +void +spdk_bs_opts_init(struct spdk_bs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOB_OPTS_CLUSTER_SZ; + opts->num_md_pages = SPDK_BLOB_OPTS_NUM_MD_PAGES; + opts->max_md_ops = SPDK_BLOB_OPTS_MAX_MD_OPS; + opts->max_channel_ops = SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS; + memset(&opts->bstype, 0, sizeof(opts->bstype)); + opts->iter_cb_fn = NULL; + opts->iter_cb_arg = NULL; +} + +static int +_spdk_bs_opts_verify(struct spdk_bs_opts *opts) +{ + if (opts->cluster_sz == 0 || opts->num_md_pages == 0 || opts->max_md_ops == 0 || + opts->max_channel_ops == 0) { + SPDK_ERRLOG("Blobstore options cannot be set to 0\n"); + return -1; + } + + return 0; +} + +static int +_spdk_bs_alloc(struct spdk_bs_dev *dev, struct spdk_bs_opts *opts, struct spdk_blob_store **_bs) +{ + struct spdk_blob_store *bs; + uint64_t dev_size; + int rc; + + dev_size = dev->blocklen * dev->blockcnt; + if (dev_size < opts->cluster_sz) { + /* Device size cannot be smaller than cluster size of blobstore */ + SPDK_INFOLOG(SPDK_LOG_BLOB, "Device size %" PRIu64 " is smaller than cluster size %" PRIu32 "\n", + dev_size, opts->cluster_sz); + return -ENOSPC; + } + if (opts->cluster_sz < SPDK_BS_PAGE_SIZE) { + /* Cluster size cannot be smaller than page size */ + SPDK_ERRLOG("Cluster size %" PRIu32 " is smaller than page size %d\n", + opts->cluster_sz, SPDK_BS_PAGE_SIZE); + return -EINVAL; + } + bs = calloc(1, sizeof(struct spdk_blob_store)); + if (!bs) { + return -ENOMEM; + } + + TAILQ_INIT(&bs->blobs); + TAILQ_INIT(&bs->snapshots); + bs->dev = dev; + bs->md_thread = spdk_get_thread(); + assert(bs->md_thread != NULL); + + /* + * Do not use _spdk_bs_lba_to_cluster() here since blockcnt may not be an + * even multiple of the cluster size. 
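+ *
+ * The plain integer division below truncates any partial trailing cluster.
+ * For example (illustrative numbers), a 1 MiB cluster on a 512-byte-block
+ * device spans 2048 blocks, so a 10000-block device yields total_clusters = 4
+ * and the trailing 1808 blocks are never used.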
+ */ + bs->cluster_sz = opts->cluster_sz; + bs->total_clusters = dev->blockcnt / (bs->cluster_sz / dev->blocklen); + bs->pages_per_cluster = bs->cluster_sz / SPDK_BS_PAGE_SIZE; + bs->num_free_clusters = bs->total_clusters; + bs->used_clusters = spdk_bit_array_create(bs->total_clusters); + bs->io_unit_size = dev->blocklen; + if (bs->used_clusters == NULL) { + free(bs); + return -ENOMEM; + } + + bs->max_channel_ops = opts->max_channel_ops; + bs->super_blob = SPDK_BLOBID_INVALID; + memcpy(&bs->bstype, &opts->bstype, sizeof(opts->bstype)); + + /* The metadata is assumed to be at least 1 page */ + bs->used_md_pages = spdk_bit_array_create(1); + bs->used_blobids = spdk_bit_array_create(0); + + pthread_mutex_init(&bs->used_clusters_mutex, NULL); + + spdk_io_device_register(bs, _spdk_bs_channel_create, _spdk_bs_channel_destroy, + sizeof(struct spdk_bs_channel), "blobstore"); + rc = spdk_bs_register_md_thread(bs); + if (rc == -1) { + spdk_io_device_unregister(bs, NULL); + pthread_mutex_destroy(&bs->used_clusters_mutex); + spdk_bit_array_free(&bs->used_blobids); + spdk_bit_array_free(&bs->used_md_pages); + spdk_bit_array_free(&bs->used_clusters); + free(bs); + /* FIXME: this is a lie but don't know how to get a proper error code here */ + return -ENOMEM; + } + + *_bs = bs; + return 0; +} + +/* START spdk_bs_load, spdk_bs_load_ctx will used for both load and unload. */ + +struct spdk_bs_load_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + + struct spdk_bs_md_mask *mask; + bool in_page_chain; + uint32_t page_index; + uint32_t cur_page; + struct spdk_blob_md_page *page; + bool is_load; + + spdk_bs_sequence_t *seq; + spdk_blob_op_with_handle_complete iter_cb_fn; + void *iter_cb_arg; +}; + +static void +_spdk_bs_load_ctx_fail(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) +{ + assert(bserrno != 0); + + spdk_dma_free(ctx->super); + spdk_bs_sequence_finish(seq, bserrno); + /* + * Only free the blobstore when a load fails. If an unload fails (for some reason) + * we want to keep the blobstore in case the caller wants to try again. 
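+ *
+ * The two cases are distinguished by ctx->is_load: spdk_bs_load() sets it to
+ * true, spdk_bs_unload() sets it to false.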
+ */ + if (ctx->is_load) { + _spdk_bs_free(ctx->bs); + } + free(ctx); +} + +static void +_spdk_bs_set_mask(struct spdk_bit_array *array, struct spdk_bs_md_mask *mask) +{ + uint32_t i = 0; + + while (true) { + i = spdk_bit_array_find_first_set(array, i); + if (i >= mask->length) { + break; + } + mask->mask[i / 8] |= 1U << (i % 8); + i++; + } +} + +static int +_spdk_bs_load_mask(struct spdk_bit_array **array_ptr, struct spdk_bs_md_mask *mask) +{ + struct spdk_bit_array *array; + uint32_t i; + + if (spdk_bit_array_resize(array_ptr, mask->length) < 0) { + return -ENOMEM; + } + + array = *array_ptr; + for (i = 0; i < mask->length; i++) { + if (mask->mask[i / 8] & (1U << (i % 8))) { + spdk_bit_array_set(array, i); + } + } + + return 0; +} + +static void +_spdk_bs_write_super(spdk_bs_sequence_t *seq, struct spdk_blob_store *bs, + struct spdk_bs_super_block *super, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + /* Update the values in the super block */ + super->super_blob = bs->super_blob; + memcpy(&super->bstype, &bs->bstype, sizeof(bs->bstype)); + super->crc = _spdk_blob_md_page_calc_crc(super); + spdk_bs_sequence_write_dev(seq, super, _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(*super)), + cb_fn, cb_arg); +} + +static void +_spdk_bs_write_used_clusters(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + /* Write out the used clusters mask */ + mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_CLUSTERS; + ctx->mask->length = ctx->bs->total_clusters; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_clusters)); + + _spdk_bs_set_mask(ctx->bs->used_clusters, ctx->mask); + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); +} + +static void +_spdk_bs_write_used_md(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_PAGES; + ctx->mask->length = ctx->super->md_len; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_md_pages)); + + _spdk_bs_set_mask(ctx->bs->used_md_pages, ctx->mask); + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); +} + +static void +_spdk_bs_write_used_blobids(spdk_bs_sequence_t *seq, void *arg, spdk_bs_sequence_cpl cb_fn) +{ + struct spdk_bs_load_ctx *ctx = arg; + uint64_t mask_size, lba, lba_count; + + if (ctx->super->used_blobid_mask_len == 0) { + /* + * This is a pre-v3 on-disk format where the blobid mask does not get + * written to disk. 
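+ * In that case there is nothing to persist, so complete immediately and let
+ * the rest of the write-out sequence continue.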
+ */ + cb_fn(seq, arg, 0); + return; + } + + mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + ctx->mask->type = SPDK_MD_MASK_TYPE_USED_BLOBIDS; + ctx->mask->length = ctx->super->md_len; + assert(ctx->mask->length == spdk_bit_array_capacity(ctx->bs->used_blobids)); + + _spdk_bs_set_mask(ctx->bs->used_blobids, ctx->mask); + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); + spdk_bs_sequence_write_dev(seq, ctx->mask, lba, lba_count, cb_fn, arg); +} + +static void +_spdk_bs_load_iter(void *arg, struct spdk_blob *blob, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = arg; + + if (bserrno == 0) { + if (ctx->iter_cb_fn) { + ctx->iter_cb_fn(ctx->iter_cb_arg, blob, 0); + } + _spdk_bs_blob_list_add(blob); + spdk_bs_iter_next(ctx->bs, blob, _spdk_bs_load_iter, ctx); + return; + } + + if (bserrno == -ENOENT) { + bserrno = 0; + } else { + /* + * This case needs to be looked at further. Same problem + * exists with applications that rely on explicit blob + * iteration. We should just skip the blob that failed + * to load and continue on to the next one. + */ + SPDK_ERRLOG("Error in iterating blobs\n"); + } + + ctx->iter_cb_fn = NULL; + + spdk_dma_free(ctx->super); + spdk_dma_free(ctx->mask); + spdk_bs_sequence_finish(ctx->seq, bserrno); + free(ctx); +} + +static void +_spdk_bs_load_complete(spdk_bs_sequence_t *seq, struct spdk_bs_load_ctx *ctx, int bserrno) +{ + ctx->seq = seq; + spdk_bs_iter_first(ctx->bs, _spdk_bs_load_iter, ctx); +} + +static void +_spdk_bs_load_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + int rc; + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_BLOBIDS); + + /* The length of the mask (in bits) must not be greater than + * the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE * 8)); + + /* The length of the mask must be exactly equal to the size + * (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = _spdk_bs_load_mask(&ctx->bs->used_blobids, ctx->mask); + if (rc < 0) { + spdk_dma_free(ctx->mask); + _spdk_bs_load_ctx_fail(seq, ctx, rc); + return; + } + + _spdk_bs_load_complete(seq, ctx, bserrno); +} + +static void +_spdk_bs_load_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_CLUSTERS); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_cluster_mask_len * sizeof( + struct spdk_blob_md_page) * 8)); + /* The length of the mask must be exactly equal to the total number of clusters */ + assert(ctx->mask->length == ctx->bs->total_clusters); + + rc = _spdk_bs_load_mask(&ctx->bs->used_clusters, ctx->mask); + if (rc < 0) { + spdk_dma_free(ctx->mask); + _spdk_bs_load_ctx_fail(seq, ctx, rc); + return; + } + + ctx->bs->num_free_clusters = spdk_bit_array_count_clear(ctx->bs->used_clusters); + assert(ctx->bs->num_free_clusters <= ctx->bs->total_clusters); + + spdk_dma_free(ctx->mask); + + /* Read the used blobids 
mask */ + mask_size = ctx->super->used_blobid_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_blobid_mask_len); + spdk_bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + _spdk_bs_load_used_blobids_cpl, ctx); +} + +static void +_spdk_bs_load_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + int rc; + + /* The type must be correct */ + assert(ctx->mask->type == SPDK_MD_MASK_TYPE_USED_PAGES); + /* The length of the mask (in bits) must not be greater than the length of the buffer (converted to bits) */ + assert(ctx->mask->length <= (ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE * + 8)); + /* The length of the mask must be exactly equal to the size (in pages) of the metadata region */ + assert(ctx->mask->length == ctx->super->md_len); + + rc = _spdk_bs_load_mask(&ctx->bs->used_md_pages, ctx->mask); + if (rc < 0) { + spdk_dma_free(ctx->mask); + _spdk_bs_load_ctx_fail(seq, ctx, rc); + return; + } + + spdk_dma_free(ctx->mask); + + /* Read the used clusters mask */ + mask_size = ctx->super->used_cluster_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_cluster_mask_len); + spdk_bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + _spdk_bs_load_used_clusters_cpl, ctx); +} + +static void +_spdk_bs_load_read_used_pages(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba, lba_count, mask_size; + + /* Read the used pages mask */ + mask_size = ctx->super->used_page_mask_len * SPDK_BS_PAGE_SIZE; + ctx->mask = spdk_dma_zmalloc(mask_size, 0x1000, NULL); + if (!ctx->mask) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_start); + lba_count = _spdk_bs_page_to_lba(ctx->bs, ctx->super->used_page_mask_len); + spdk_bs_sequence_read_dev(seq, ctx->mask, lba, lba_count, + _spdk_bs_load_used_pages_cpl, ctx); +} + +static int +_spdk_bs_load_replay_md_parse_page(const struct spdk_blob_md_page *page, struct spdk_blob_store *bs) +{ + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT) { + struct spdk_blob_md_descriptor_extent *desc_extent; + unsigned int i, j; + unsigned int cluster_count = 0; + uint32_t cluster_idx; + + desc_extent = (struct spdk_blob_md_descriptor_extent *)desc; + + for (i = 0; i < desc_extent->length / sizeof(desc_extent->extents[0]); i++) { + for (j = 0; j < desc_extent->extents[i].length; j++) { + cluster_idx = desc_extent->extents[i].cluster_idx; + /* + * cluster_idx = 0 means an unallocated cluster - don't mark that + * in the used cluster map. 
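+ * For example, an extent entry of { cluster_idx = 5, length = 3 } marks
+ * clusters 5, 6 and 7 as used, while { cluster_idx = 0, length = 3 } counts
+ * three thin-provisioned clusters and leaves the used-cluster map untouched.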
+ */ + if (cluster_idx != 0) { + spdk_bit_array_set(bs->used_clusters, cluster_idx + j); + if (bs->num_free_clusters == 0) { + return -ENOSPC; + } + bs->num_free_clusters--; + } + cluster_count++; + } + } + if (cluster_count == 0) { + return -EINVAL; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* Skip this item */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* Skip this item */ + } else { + /* Error */ + return -EINVAL; + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } + return 0; +} + +static bool _spdk_bs_load_cur_md_page_valid(struct spdk_bs_load_ctx *ctx) +{ + uint32_t crc; + + crc = _spdk_blob_md_page_calc_crc(ctx->page); + if (crc != ctx->page->crc) { + return false; + } + + if (_spdk_bs_page_to_blobid(ctx->cur_page) != ctx->page->id) { + return false; + } + return true; +} + +static void +_spdk_bs_load_replay_cur_md_page(spdk_bs_sequence_t *seq, void *cb_arg); + +static void +_spdk_bs_load_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + _spdk_bs_load_complete(seq, ctx, bserrno); +} + +static void +_spdk_bs_load_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->mask); + ctx->mask = NULL; + + _spdk_bs_write_used_clusters(seq, cb_arg, _spdk_bs_load_write_used_clusters_cpl); +} + +static void +_spdk_bs_load_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->mask); + ctx->mask = NULL; + + _spdk_bs_write_used_blobids(seq, cb_arg, _spdk_bs_load_write_used_blobids_cpl); +} + +static void +_spdk_bs_load_write_used_md(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + _spdk_bs_write_used_md(seq, cb_arg, _spdk_bs_load_write_used_pages_cpl); +} + +static void +_spdk_bs_load_replay_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t num_md_clusters; + uint64_t i; + uint32_t page_num; + + if (bserrno != 0) { + _spdk_bs_load_ctx_fail(seq, ctx, bserrno); + return; + } + + page_num = ctx->cur_page; + if (_spdk_bs_load_cur_md_page_valid(ctx) == true) { + if (ctx->page->sequence_num == 0 || ctx->in_page_chain == true) { + spdk_bit_array_set(ctx->bs->used_md_pages, page_num); + if (ctx->page->sequence_num == 0) { + spdk_bit_array_set(ctx->bs->used_blobids, page_num); + } + if (_spdk_bs_load_replay_md_parse_page(ctx->page, ctx->bs)) { + _spdk_bs_load_ctx_fail(seq, ctx, -EILSEQ); + return; + } + if (ctx->page->next != SPDK_INVALID_MD_PAGE) { + ctx->in_page_chain = true; + ctx->cur_page = ctx->page->next; + _spdk_bs_load_replay_cur_md_page(seq, cb_arg); + return; + } + } + } + + ctx->in_page_chain = false; + + do { + ctx->page_index++; + } while (spdk_bit_array_get(ctx->bs->used_md_pages, ctx->page_index) == true); + + if (ctx->page_index < ctx->super->md_len) { + ctx->cur_page = ctx->page_index; + _spdk_bs_load_replay_cur_md_page(seq, cb_arg); + } else { + /* Claim all of the clusters used by the metadata */ + num_md_clusters = divide_round_up(ctx->super->md_len, ctx->bs->pages_per_cluster); + for (i = 0; i < num_md_clusters; i++) { + 
_spdk_bs_claim_cluster(ctx->bs, i); + } + spdk_dma_free(ctx->page); + _spdk_bs_load_write_used_md(seq, ctx, bserrno); + } +} + +static void +_spdk_bs_load_replay_cur_md_page(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint64_t lba; + + assert(ctx->cur_page < ctx->super->md_len); + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); + spdk_bs_sequence_read_dev(seq, ctx->page, lba, + _spdk_bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + _spdk_bs_load_replay_md_cpl, ctx); +} + +static void +_spdk_bs_load_replay_md(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + ctx->page_index = 0; + ctx->cur_page = 0; + ctx->page = spdk_dma_zmalloc(SPDK_BS_PAGE_SIZE, + SPDK_BS_PAGE_SIZE, + NULL); + if (!ctx->page) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + _spdk_bs_load_replay_cur_md_page(seq, cb_arg); +} + +static void +_spdk_bs_recover(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + int rc; + + rc = spdk_bit_array_resize(&ctx->bs->used_md_pages, ctx->super->md_len); + if (rc < 0) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_blobids, ctx->super->md_len); + if (rc < 0) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + + ctx->bs->num_free_clusters = ctx->bs->total_clusters; + _spdk_bs_load_replay_md(seq, cb_arg); +} + +static void +_spdk_bs_load_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + uint32_t crc; + int rc; + static const char zeros[SPDK_BLOBSTORE_TYPE_LENGTH]; + + if (ctx->super->version > SPDK_BS_VERSION || + ctx->super->version < SPDK_BS_INITIAL_VERSION) { + _spdk_bs_load_ctx_fail(seq, ctx, -EILSEQ); + return; + } + + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + _spdk_bs_load_ctx_fail(seq, ctx, -EILSEQ); + return; + } + + crc = _spdk_blob_md_page_calc_crc(ctx->super); + if (crc != ctx->super->crc) { + _spdk_bs_load_ctx_fail(seq, ctx, -EILSEQ); + return; + } + + if (memcmp(&ctx->bs->bstype, &ctx->super->bstype, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype matched - loading blobstore\n"); + } else if (memcmp(&ctx->bs->bstype, zeros, SPDK_BLOBSTORE_TYPE_LENGTH) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Bstype wildcard used - loading blobstore regardless bstype\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Unexpected bstype\n"); + SPDK_TRACEDUMP(SPDK_LOG_BLOB, "Expected:", ctx->bs->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + SPDK_TRACEDUMP(SPDK_LOG_BLOB, "Found:", ctx->super->bstype.bstype, SPDK_BLOBSTORE_TYPE_LENGTH); + _spdk_bs_load_ctx_fail(seq, ctx, -ENXIO); + return; + } + + if (ctx->super->size > ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen) { + SPDK_NOTICELOG("Size mismatch, dev size: %lu, blobstore size: %lu\n", + ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen, ctx->super->size); + _spdk_bs_load_ctx_fail(seq, ctx, -EILSEQ); + return; + } + + if (ctx->super->size == 0) { + ctx->super->size = ctx->bs->dev->blockcnt * ctx->bs->dev->blocklen; + } + + if (ctx->super->io_unit_size == 0) { + ctx->super->io_unit_size = SPDK_BS_PAGE_SIZE; + } + + /* Parse the super block */ + ctx->bs->clean = 1; + ctx->bs->cluster_sz = ctx->super->cluster_size; + ctx->bs->total_clusters 
= ctx->super->size / ctx->super->cluster_size; + ctx->bs->pages_per_cluster = ctx->bs->cluster_sz / SPDK_BS_PAGE_SIZE; + ctx->bs->io_unit_size = ctx->super->io_unit_size; + rc = spdk_bit_array_resize(&ctx->bs->used_clusters, ctx->bs->total_clusters); + if (rc < 0) { + _spdk_bs_load_ctx_fail(seq, ctx, -ENOMEM); + return; + } + ctx->bs->md_start = ctx->super->md_start; + ctx->bs->md_len = ctx->super->md_len; + ctx->bs->total_data_clusters = ctx->bs->total_clusters - divide_round_up( + ctx->bs->md_start + ctx->bs->md_len, ctx->bs->pages_per_cluster); + ctx->bs->super_blob = ctx->super->super_blob; + memcpy(&ctx->bs->bstype, &ctx->super->bstype, sizeof(ctx->super->bstype)); + + if (ctx->super->used_blobid_mask_len == 0 || ctx->super->clean == 0) { + _spdk_bs_recover(seq, ctx); + } else { + _spdk_bs_load_read_used_pages(seq, ctx); + } +} + +void +spdk_bs_load(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_load_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Loading blobstore from dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "unsupported dev block length of %d\n", dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (opts.max_md_ops == 0 || opts.max_channel_ops == 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + err = _spdk_bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->is_load = true; + ctx->iter_cb_fn = opts.iter_cb_fn; + ctx->iter_cb_arg = opts.iter_cb_arg; + + /* Allocate memory for the super block */ + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_dma_free(ctx->super); + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + /* Read the super block */ + spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)), + _spdk_bs_load_super_cpl, ctx); +} + +/* END spdk_bs_load */ + +/* START spdk_bs_dump */ + +struct spdk_bs_dump_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; + uint32_t cur_page; + struct spdk_blob_md_page *page; + spdk_bs_sequence_t *seq; + FILE *fp; + spdk_bs_dump_print_xattr print_xattr_fn; + char xattr_name[4096]; +}; + +static void +_spdk_bs_dump_finish(spdk_bs_sequence_t *seq, struct spdk_bs_dump_ctx *ctx, int bserrno) +{ + spdk_dma_free(ctx->super); + + /* + * We need to defer calling spdk_bs_call_cpl() until after + * dev destruction, so tuck these away for later use. 
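+ * _spdk_bs_dev_destroy() will invoke the saved unload_cpl with unload_err
+ * once the underlying bs_dev has actually been destroyed.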
+ */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + spdk_bs_sequence_finish(seq, 0); + _spdk_bs_free(ctx->bs); + free(ctx); +} + +static void _spdk_bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg); + +static void +_spdk_bs_dump_print_md_page(struct spdk_bs_dump_ctx *ctx) +{ + uint32_t page_idx = ctx->cur_page; + struct spdk_blob_md_page *page = ctx->page; + struct spdk_blob_md_descriptor *desc; + size_t cur_desc = 0; + uint32_t crc; + + fprintf(ctx->fp, "=========\n"); + fprintf(ctx->fp, "Metadata Page Index: %" PRIu32 " (0x%" PRIx32 ")\n", page_idx, page_idx); + fprintf(ctx->fp, "Blob ID: 0x%" PRIx64 "\n", page->id); + + crc = _spdk_blob_md_page_calc_crc(page); + fprintf(ctx->fp, "CRC: 0x%" PRIx32 " (%s)\n", page->crc, crc == page->crc ? "OK" : "Mismatch"); + + desc = (struct spdk_blob_md_descriptor *)page->descriptors; + while (cur_desc < sizeof(page->descriptors)) { + if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_PADDING) { + if (desc->length == 0) { + /* If padding and length are 0, this terminates the page */ + break; + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_EXTENT) { + struct spdk_blob_md_descriptor_extent *desc_extent; + unsigned int i; + + desc_extent = (struct spdk_blob_md_descriptor_extent *)desc; + + for (i = 0; i < desc_extent->length / sizeof(desc_extent->extents[0]); i++) { + if (desc_extent->extents[i].cluster_idx != 0) { + fprintf(ctx->fp, "Allocated Extent - Start: %" PRIu32, + desc_extent->extents[i].cluster_idx); + } else { + fprintf(ctx->fp, "Unallocated Extent - "); + } + fprintf(ctx->fp, " Length: %" PRIu32, desc_extent->extents[i].length); + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR) { + struct spdk_blob_md_descriptor_xattr *desc_xattr; + uint32_t i; + + desc_xattr = (struct spdk_blob_md_descriptor_xattr *)desc; + + if (desc_xattr->length != + sizeof(desc_xattr->name_length) + sizeof(desc_xattr->value_length) + + desc_xattr->name_length + desc_xattr->value_length) { + } + + memcpy(ctx->xattr_name, desc_xattr->name, desc_xattr->name_length); + ctx->xattr_name[desc_xattr->name_length] = '\0'; + fprintf(ctx->fp, "XATTR: name = \"%s\"\n", ctx->xattr_name); + fprintf(ctx->fp, " value = \""); + ctx->print_xattr_fn(ctx->fp, ctx->super->bstype.bstype, ctx->xattr_name, + (void *)((uintptr_t)desc_xattr->name + desc_xattr->name_length), + desc_xattr->value_length); + fprintf(ctx->fp, "\"\n"); + for (i = 0; i < desc_xattr->value_length; i++) { + if (i % 16 == 0) { + fprintf(ctx->fp, " "); + } + fprintf(ctx->fp, "%02" PRIx8 " ", *((uint8_t *)desc_xattr->name + desc_xattr->name_length + i)); + if ((i + 1) % 16 == 0) { + fprintf(ctx->fp, "\n"); + } + } + if (i % 16 != 0) { + fprintf(ctx->fp, "\n"); + } + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL) { + /* TODO */ + } else if (desc->type == SPDK_MD_DESCRIPTOR_TYPE_FLAGS) { + /* TODO */ + } else { + /* Error */ + } + /* Advance to the next descriptor */ + cur_desc += sizeof(*desc) + desc->length; + if (cur_desc + sizeof(*desc) > sizeof(page->descriptors)) { + break; + } + desc = (struct spdk_blob_md_descriptor *)((uintptr_t)page->descriptors + cur_desc); + } +} + +static void +_spdk_bs_dump_read_md_page_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + if (bserrno != 0) { + _spdk_bs_dump_finish(seq, ctx, bserrno); + return; + } + + if (ctx->page->id != 0) { + _spdk_bs_dump_print_md_page(ctx); + } + 
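+ /* Advance to the next metadata page; once every page in the metadata
+  * region has been dumped, release the page buffer and finish the dump. */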
+ ctx->cur_page++; + + if (ctx->cur_page < ctx->super->md_len) { + _spdk_bs_dump_read_md_page(seq, cb_arg); + } else { + spdk_dma_free(ctx->page); + _spdk_bs_dump_finish(seq, ctx, 0); + } +} + +static void +_spdk_bs_dump_read_md_page(spdk_bs_sequence_t *seq, void *cb_arg) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + uint64_t lba; + + assert(ctx->cur_page < ctx->super->md_len); + lba = _spdk_bs_page_to_lba(ctx->bs, ctx->super->md_start + ctx->cur_page); + spdk_bs_sequence_read_dev(seq, ctx->page, lba, + _spdk_bs_byte_to_lba(ctx->bs, SPDK_BS_PAGE_SIZE), + _spdk_bs_dump_read_md_page_cpl, ctx); +} + +static void +_spdk_bs_dump_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_dump_ctx *ctx = cb_arg; + + fprintf(ctx->fp, "Signature: \"%.8s\" ", ctx->super->signature); + if (memcmp(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)) != 0) { + fprintf(ctx->fp, "(Mismatch)\n"); + _spdk_bs_dump_finish(seq, ctx, bserrno); + return; + } else { + fprintf(ctx->fp, "(OK)\n"); + } + fprintf(ctx->fp, "Version: %" PRIu32 "\n", ctx->super->version); + fprintf(ctx->fp, "CRC: 0x%x (%s)\n", ctx->super->crc, + (ctx->super->crc == _spdk_blob_md_page_calc_crc(ctx->super)) ? "OK" : "Mismatch"); + fprintf(ctx->fp, "Blobstore Type: %.*s\n", SPDK_BLOBSTORE_TYPE_LENGTH, ctx->super->bstype.bstype); + fprintf(ctx->fp, "Cluster Size: %" PRIu32 "\n", ctx->super->cluster_size); + fprintf(ctx->fp, "Super Blob ID: "); + if (ctx->super->super_blob == SPDK_BLOBID_INVALID) { + fprintf(ctx->fp, "(None)\n"); + } else { + fprintf(ctx->fp, "%" PRIu64 "\n", ctx->super->super_blob); + } + fprintf(ctx->fp, "Clean: %" PRIu32 "\n", ctx->super->clean); + fprintf(ctx->fp, "Used Metadata Page Mask Start: %" PRIu32 "\n", ctx->super->used_page_mask_start); + fprintf(ctx->fp, "Used Metadata Page Mask Length: %" PRIu32 "\n", ctx->super->used_page_mask_len); + fprintf(ctx->fp, "Used Cluster Mask Start: %" PRIu32 "\n", ctx->super->used_cluster_mask_start); + fprintf(ctx->fp, "Used Cluster Mask Length: %" PRIu32 "\n", ctx->super->used_cluster_mask_len); + fprintf(ctx->fp, "Used Blob ID Mask Start: %" PRIu32 "\n", ctx->super->used_blobid_mask_start); + fprintf(ctx->fp, "Used Blob ID Mask Length: %" PRIu32 "\n", ctx->super->used_blobid_mask_len); + fprintf(ctx->fp, "Metadata Start: %" PRIu32 "\n", ctx->super->md_start); + fprintf(ctx->fp, "Metadata Length: %" PRIu32 "\n", ctx->super->md_len); + + ctx->cur_page = 0; + ctx->page = spdk_dma_zmalloc(SPDK_BS_PAGE_SIZE, + SPDK_BS_PAGE_SIZE, + NULL); + if (!ctx->page) { + _spdk_bs_dump_finish(seq, ctx, -ENOMEM); + return; + } + _spdk_bs_dump_read_md_page(seq, cb_arg); +} + +void +spdk_bs_dump(struct spdk_bs_dev *dev, FILE *fp, spdk_bs_dump_print_xattr print_xattr_fn, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_dump_ctx *ctx; + struct spdk_bs_opts opts = {}; + int err; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Dumping blobstore from dev %p\n", dev); + + spdk_bs_opts_init(&opts); + + err = _spdk_bs_alloc(dev, &opts, &bs); + if (err) { + dev->destroy(dev); + cb_fn(cb_arg, err); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + _spdk_bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->fp = fp; + ctx->print_xattr_fn = print_xattr_fn; + + /* Allocate memory for the super block */ + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, 
-ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_dma_free(ctx->super); + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read the super block */ + spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)), + _spdk_bs_dump_super_cpl, ctx); +} + +/* END spdk_bs_dump */ + +/* START spdk_bs_init */ + +struct spdk_bs_init_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +_spdk_bs_init_persist_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->super); + free(ctx); + + spdk_bs_sequence_finish(seq, bserrno); +} + +static void +_spdk_bs_init_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + + /* Write super block */ + spdk_bs_sequence_write_dev(seq, ctx->super, _spdk_bs_page_to_lba(ctx->bs, 0), + _spdk_bs_byte_to_lba(ctx->bs, sizeof(*ctx->super)), + _spdk_bs_init_persist_super_cpl, ctx); +} + +void +spdk_bs_init(struct spdk_bs_dev *dev, struct spdk_bs_opts *o, + spdk_bs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_init_ctx *ctx; + struct spdk_blob_store *bs; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + spdk_bs_batch_t *batch; + uint64_t num_md_lba; + uint64_t num_md_pages; + uint64_t num_md_clusters; + uint32_t i; + struct spdk_bs_opts opts = {}; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Initializing blobstore on dev %p\n", dev); + + if ((SPDK_BS_PAGE_SIZE % dev->blocklen) != 0) { + SPDK_ERRLOG("unsupported dev block length of %d\n", + dev->blocklen); + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + if (o) { + opts = *o; + } else { + spdk_bs_opts_init(&opts); + } + + if (_spdk_bs_opts_verify(&opts) != 0) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = _spdk_bs_alloc(dev, &opts, &bs); + if (rc) { + dev->destroy(dev); + cb_fn(cb_arg, NULL, rc); + return; + } + + if (opts.num_md_pages == SPDK_BLOB_OPTS_NUM_MD_PAGES) { + /* By default, allocate 1 page per cluster. + * Technically, this over-allocates metadata + * because more metadata will reduce the number + * of usable clusters. This can be addressed with + * more complex math in the future. 
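+ *
+ * For example (illustrative), a 10 GiB device with 1 MiB clusters has 10240
+ * clusters, so the default reserves 10240 metadata pages (40 MiB at the
+ * 4 KiB metadata page size).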
+ */ + bs->md_len = bs->total_clusters; + } else { + bs->md_len = opts.num_md_pages; + } + rc = spdk_bit_array_resize(&bs->used_md_pages, bs->md_len); + if (rc < 0) { + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + rc = spdk_bit_array_resize(&bs->used_blobids, bs->md_len); + if (rc < 0) { + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->bs = bs; + + /* Allocate memory for the super block */ + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + memcpy(ctx->super->signature, SPDK_BS_SUPER_BLOCK_SIG, + sizeof(ctx->super->signature)); + ctx->super->version = SPDK_BS_VERSION; + ctx->super->length = sizeof(*ctx->super); + ctx->super->super_blob = bs->super_blob; + ctx->super->clean = 0; + ctx->super->cluster_size = bs->cluster_sz; + ctx->super->io_unit_size = bs->io_unit_size; + memcpy(&ctx->super->bstype, &bs->bstype, sizeof(bs->bstype)); + + /* Calculate how many pages the metadata consumes at the front + * of the disk. + */ + + /* The super block uses 1 page */ + num_md_pages = 1; + + /* The used_md_pages mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_page_mask_start = num_md_pages; + ctx->super->used_page_mask_len = divide_round_up(sizeof(struct spdk_bs_md_mask) + + divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_page_mask_len; + + /* The used_clusters mask requires 1 bit per cluster, rounded + * up to the nearest page, plus a header. + */ + ctx->super->used_cluster_mask_start = num_md_pages; + ctx->super->used_cluster_mask_len = divide_round_up(sizeof(struct spdk_bs_md_mask) + + divide_round_up(bs->total_clusters, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_cluster_mask_len; + + /* The used_blobids mask requires 1 bit per metadata page, rounded + * up to the nearest page, plus a header. 
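+ * For example, with md_len = 10240 metadata pages the bitmask itself is 1280
+ * bytes, which together with the spdk_bs_md_mask header still fits in one
+ * page, so used_blobid_mask_len comes out to 1.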
+ */ + ctx->super->used_blobid_mask_start = num_md_pages; + ctx->super->used_blobid_mask_len = divide_round_up(sizeof(struct spdk_bs_md_mask) + + divide_round_up(bs->md_len, 8), + SPDK_BS_PAGE_SIZE); + num_md_pages += ctx->super->used_blobid_mask_len; + + /* The metadata region size was chosen above */ + ctx->super->md_start = bs->md_start = num_md_pages; + ctx->super->md_len = bs->md_len; + num_md_pages += bs->md_len; + + num_md_lba = _spdk_bs_page_to_lba(bs, num_md_pages); + + ctx->super->size = dev->blockcnt * dev->blocklen; + + ctx->super->crc = _spdk_blob_md_page_calc_crc(ctx->super); + + num_md_clusters = divide_round_up(num_md_pages, bs->pages_per_cluster); + if (num_md_clusters > bs->total_clusters) { + SPDK_ERRLOG("Blobstore metadata cannot use more clusters than is available, " + "please decrease number of pages reserved for metadata " + "or increase cluster size.\n"); + spdk_dma_free(ctx->super); + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + /* Claim all of the clusters used by the metadata */ + for (i = 0; i < num_md_clusters; i++) { + _spdk_bs_claim_cluster(bs, i); + } + + bs->total_data_clusters = bs->num_free_clusters; + + cpl.type = SPDK_BS_CPL_TYPE_BS_HANDLE; + cpl.u.bs_handle.cb_fn = cb_fn; + cpl.u.bs_handle.cb_arg = cb_arg; + cpl.u.bs_handle.bs = bs; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_dma_free(ctx->super); + free(ctx); + _spdk_bs_free(bs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + batch = spdk_bs_sequence_to_batch(seq, _spdk_bs_init_trim_cpl, ctx); + + /* Clear metadata space */ + spdk_bs_batch_write_zeroes_dev(batch, 0, num_md_lba); + /* Trim data clusters */ + spdk_bs_batch_unmap_dev(batch, num_md_lba, ctx->bs->dev->blockcnt - num_md_lba); + + spdk_bs_batch_close(batch); +} + +/* END spdk_bs_init */ + +/* START spdk_bs_destroy */ + +static void +_spdk_bs_destroy_trim_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_init_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + + /* + * We need to defer calling spdk_bs_call_cpl() until after + * dev destruction, so tuck these away for later use. 
+ */ + bs->unload_err = bserrno; + memcpy(&bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + spdk_bs_sequence_finish(seq, bserrno); + + _spdk_bs_free(bs); + free(ctx); +} + +void +spdk_bs_destroy(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_init_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Destroying blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Write zeroes to the super block */ + spdk_bs_sequence_write_zeroes_dev(seq, + _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(struct spdk_bs_super_block)), + _spdk_bs_destroy_trim_cpl, ctx); +} + +/* END spdk_bs_destroy */ + +/* START spdk_bs_unload */ + +static void +_spdk_bs_unload_write_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->super); + + /* + * We need to defer calling spdk_bs_call_cpl() until after + * dev destruction, so tuck these away for later use. + */ + ctx->bs->unload_err = bserrno; + memcpy(&ctx->bs->unload_cpl, &seq->cpl, sizeof(struct spdk_bs_cpl)); + seq->cpl.type = SPDK_BS_CPL_TYPE_NONE; + + spdk_bs_sequence_finish(seq, bserrno); + + _spdk_bs_free(ctx->bs); + free(ctx); +} + +static void +_spdk_bs_unload_write_used_clusters_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->mask); + ctx->super->clean = 1; + + _spdk_bs_write_super(seq, ctx->bs, ctx->super, _spdk_bs_unload_write_super_cpl, ctx); +} + +static void +_spdk_bs_unload_write_used_blobids_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->mask); + ctx->mask = NULL; + + _spdk_bs_write_used_clusters(seq, cb_arg, _spdk_bs_unload_write_used_clusters_cpl); +} + +static void +_spdk_bs_unload_write_used_pages_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_load_ctx *ctx = cb_arg; + + spdk_dma_free(ctx->mask); + ctx->mask = NULL; + + _spdk_bs_write_used_blobids(seq, cb_arg, _spdk_bs_unload_write_used_blobids_cpl); +} + +static void +_spdk_bs_unload_read_super_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + _spdk_bs_write_used_md(seq, cb_arg, _spdk_bs_unload_write_used_pages_cpl); +} + +void +spdk_bs_unload(struct spdk_blob_store *bs, spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_load_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blobstore\n"); + + if (!TAILQ_EMPTY(&bs->blobs)) { + SPDK_ERRLOG("Blobstore still has open blobs\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + ctx->is_load = false; + + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + 
seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_dma_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Read super block */ + spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)), + _spdk_bs_unload_read_super_cpl, ctx); +} + +/* END spdk_bs_unload */ + +/* START spdk_bs_set_super */ + +struct spdk_bs_set_super_ctx { + struct spdk_blob_store *bs; + struct spdk_bs_super_block *super; +}; + +static void +_spdk_bs_set_super_write_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to write to super block of blobstore\n"); + } + + spdk_dma_free(ctx->super); + + spdk_bs_sequence_finish(seq, bserrno); + + free(ctx); +} + +static void +_spdk_bs_set_super_read_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_bs_set_super_ctx *ctx = cb_arg; + + if (bserrno != 0) { + SPDK_ERRLOG("Unable to read super block of blobstore\n"); + spdk_dma_free(ctx->super); + spdk_bs_sequence_finish(seq, bserrno); + free(ctx); + return; + } + + _spdk_bs_write_super(seq, ctx->bs, ctx->super, _spdk_bs_set_super_write_cpl, ctx); +} + +void +spdk_bs_set_super(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_bs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_bs_set_super_ctx *ctx; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Setting super blob id on blobstore\n"); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->bs = bs; + + ctx->super = spdk_dma_zmalloc(sizeof(*ctx->super), 0x1000, NULL); + if (!ctx->super) { + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BS_BASIC; + cpl.u.bs_basic.cb_fn = cb_fn; + cpl.u.bs_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + spdk_dma_free(ctx->super); + free(ctx); + cb_fn(cb_arg, -ENOMEM); + return; + } + + bs->super_blob = blobid; + + /* Read super block */ + spdk_bs_sequence_read_dev(seq, ctx->super, _spdk_bs_page_to_lba(bs, 0), + _spdk_bs_byte_to_lba(bs, sizeof(*ctx->super)), + _spdk_bs_set_super_read_cpl, ctx); +} + +/* END spdk_bs_set_super */ + +void +spdk_bs_get_super(struct spdk_blob_store *bs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + if (bs->super_blob == SPDK_BLOBID_INVALID) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOENT); + } else { + cb_fn(cb_arg, bs->super_blob, 0); + } +} + +uint64_t +spdk_bs_get_cluster_size(struct spdk_blob_store *bs) +{ + return bs->cluster_sz; +} + +uint64_t +spdk_bs_get_page_size(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE; +} + +uint64_t +spdk_bs_get_io_unit_size(struct spdk_blob_store *bs) +{ + return bs->io_unit_size; +} + +uint64_t +spdk_bs_free_cluster_count(struct spdk_blob_store *bs) +{ + return bs->num_free_clusters; +} + +uint64_t +spdk_bs_total_data_cluster_count(struct spdk_blob_store *bs) +{ + return bs->total_data_clusters; +} + +static int +spdk_bs_register_md_thread(struct spdk_blob_store *bs) +{ + bs->md_channel = spdk_get_io_channel(bs); + if (!bs->md_channel) { + SPDK_ERRLOG("Failed to get IO channel.\n"); + return -1; + } + + return 0; +} + +static int +spdk_bs_unregister_md_thread(struct spdk_blob_store *bs) +{ + spdk_put_io_channel(bs->md_channel); + + return 0; +} + +spdk_blob_id spdk_blob_get_id(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->id; +} + +uint64_t 
spdk_blob_get_num_pages(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return _spdk_bs_cluster_to_page(blob->bs, blob->active.num_clusters); +} + +uint64_t spdk_blob_get_num_io_units(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return spdk_blob_get_num_pages(blob) * _spdk_bs_io_unit_per_page(blob->bs); +} + +uint64_t spdk_blob_get_num_clusters(struct spdk_blob *blob) +{ + assert(blob != NULL); + + return blob->active.num_clusters; +} + +/* START spdk_bs_create_blob */ + +static void +_spdk_bs_create_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + _spdk_blob_free(blob); + + spdk_bs_sequence_finish(seq, bserrno); +} + +static int +_spdk_blob_set_xattrs(struct spdk_blob *blob, const struct spdk_blob_xattr_opts *xattrs, + bool internal) +{ + uint64_t i; + size_t value_len = 0; + int rc; + const void *value = NULL; + if (xattrs->count > 0 && xattrs->get_value == NULL) { + return -EINVAL; + } + for (i = 0; i < xattrs->count; i++) { + xattrs->get_value(xattrs->ctx, xattrs->names[i], &value, &value_len); + if (value == NULL || value_len == 0) { + return -EINVAL; + } + rc = _spdk_blob_set_xattr(blob, xattrs->names[i], value, value_len, internal); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +_spdk_blob_set_thin_provision(struct spdk_blob *blob) +{ + _spdk_blob_verify_md_op(blob); + blob->invalid_flags |= SPDK_BLOB_THIN_PROV; + blob->state = SPDK_BLOB_STATE_DIRTY; +} + +static void +_spdk_bs_create_blob(struct spdk_blob_store *bs, + const struct spdk_blob_opts *opts, + const struct spdk_blob_xattr_opts *internal_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob; + uint32_t page_idx; + struct spdk_bs_cpl cpl; + struct spdk_blob_opts opts_default; + struct spdk_blob_xattr_opts internal_xattrs_default; + spdk_bs_sequence_t *seq; + spdk_blob_id id; + int rc; + + assert(spdk_get_thread() == bs->md_thread); + + page_idx = spdk_bit_array_find_first_clear(bs->used_md_pages, 0); + if (page_idx == UINT32_MAX) { + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + spdk_bit_array_set(bs->used_blobids, page_idx); + spdk_bit_array_set(bs->used_md_pages, page_idx); + + id = _spdk_bs_page_to_blobid(page_idx); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Creating blob with id %lu at page %u\n", id, page_idx); + + blob = _spdk_blob_alloc(bs, id); + if (!blob) { + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + if (!opts) { + spdk_blob_opts_init(&opts_default); + opts = &opts_default; + } + if (!internal_xattrs) { + _spdk_blob_xattrs_init(&internal_xattrs_default); + internal_xattrs = &internal_xattrs_default; + } + + rc = _spdk_blob_set_xattrs(blob, &opts->xattrs, false); + if (rc < 0) { + _spdk_blob_free(blob); + cb_fn(cb_arg, 0, rc); + return; + } + + rc = _spdk_blob_set_xattrs(blob, internal_xattrs, true); + if (rc < 0) { + _spdk_blob_free(blob); + cb_fn(cb_arg, 0, rc); + return; + } + + if (opts->thin_provision) { + _spdk_blob_set_thin_provision(blob); + } + + rc = _spdk_blob_resize(blob, opts->num_clusters); + if (rc < 0) { + _spdk_blob_free(blob); + cb_fn(cb_arg, 0, rc); + return; + } + cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + cpl.u.blobid.cb_fn = cb_fn; + cpl.u.blobid.cb_arg = cb_arg; + cpl.u.blobid.blobid = blob->id; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + _spdk_blob_free(blob); + cb_fn(cb_arg, 0, -ENOMEM); + return; + } + + _spdk_blob_persist(seq, blob, _spdk_bs_create_blob_cpl, blob); +} + +void spdk_bs_create_blob(struct spdk_blob_store *bs, + 
spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + _spdk_bs_create_blob(bs, NULL, NULL, cb_fn, cb_arg); +} + +void spdk_bs_create_blob_ext(struct spdk_blob_store *bs, const struct spdk_blob_opts *opts, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + _spdk_bs_create_blob(bs, opts, NULL, cb_fn, cb_arg); +} + +/* END spdk_bs_create_blob */ + +/* START blob_cleanup */ + +struct spdk_clone_snapshot_ctx { + struct spdk_bs_cpl cpl; + int bserrno; + bool frozen; + + struct spdk_io_channel *channel; + + /* Current cluster for inflate operation */ + uint64_t cluster; + + /* For inflation force allocation of all unallocated clusters and remove + * thin-provisioning. Otherwise only decouple parent and keep clone thin. */ + bool allocate_all; + + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } original; + struct { + spdk_blob_id id; + struct spdk_blob *blob; + } new; + + /* xattrs specified for snapshot/clones only. They have no impact on + * the original blobs xattrs. */ + const struct spdk_blob_xattr_opts *xattrs; +}; + +static void +_spdk_bs_clone_snapshot_cleanup_finish(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = cb_arg; + struct spdk_bs_cpl *cpl = &ctx->cpl; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, cpl->u.blobid.blobid, ctx->bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, ctx->bserrno); + break; + default: + SPDK_UNREACHABLE(); + break; + } + + free(ctx); +} + +static void +_spdk_bs_snapshot_unfreeze_cpl(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Unfreeze error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + ctx->original.id = origblob->id; + spdk_blob_close(origblob, _spdk_bs_clone_snapshot_cleanup_finish, ctx); +} + +static void +_spdk_bs_clone_snapshot_origblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + if (ctx->frozen) { + /* Unfreeze any outstanding I/O */ + _spdk_blob_unfreeze_io(origblob, _spdk_bs_snapshot_unfreeze_cpl, ctx); + } else { + _spdk_bs_snapshot_unfreeze_cpl(ctx, 0); + } + +} + +static void +_spdk_bs_clone_snapshot_newblob_cleanup(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *newblob = ctx->new.blob; + + if (bserrno != 0) { + if (ctx->bserrno != 0) { + SPDK_ERRLOG("Cleanup error %d\n", bserrno); + } else { + ctx->bserrno = bserrno; + } + } + + ctx->new.id = newblob->id; + spdk_blob_close(newblob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx); +} + +/* END blob_cleanup */ + +/* START spdk_bs_create_snapshot */ + +static void +_spdk_bs_snapshot_origblob_sync_cpl(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *newblob = ctx->new.blob; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + /* 
Remove metadata descriptor SNAPSHOT_IN_PROGRESS */ + bserrno = _spdk_blob_remove_xattr(newblob, SNAPSHOT_IN_PROGRESS, true); + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + _spdk_bs_blob_list_add(ctx->original.blob); + + spdk_blob_set_read_only(newblob); + + /* sync snapshot metadata */ + spdk_blob_sync_md(newblob, _spdk_bs_clone_snapshot_origblob_cleanup, cb_arg); +} + +static void +_spdk_bs_snapshot_newblob_sync_cpl(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = ctx->new.blob; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + /* Set internal xattr for snapshot id */ + bserrno = _spdk_blob_set_xattr(origblob, BLOB_SNAPSHOT, &newblob->id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + + _spdk_bs_blob_list_remove(origblob); + origblob->parent_id = newblob->id; + + /* Create new back_bs_dev for snapshot */ + origblob->back_bs_dev = spdk_bs_create_blob_bs_dev(newblob); + if (origblob->back_bs_dev == NULL) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, -EINVAL); + return; + } + + /* set clone blob as thin provisioned */ + _spdk_blob_set_thin_provision(origblob); + + _spdk_bs_blob_list_add(newblob); + + /* Zero out origblob cluster map */ + memset(origblob->active.clusters, 0, + origblob->active.num_clusters * sizeof(origblob->active.clusters)); + + /* sync clone metadata */ + spdk_blob_sync_md(origblob, _spdk_bs_snapshot_origblob_sync_cpl, ctx); +} + +static void +_spdk_bs_snapshot_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = ctx->new.blob; + int bserrno; + + if (rc != 0) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, rc); + return; + } + + ctx->frozen = true; + + /* set new back_bs_dev for snapshot */ + newblob->back_bs_dev = origblob->back_bs_dev; + /* Set invalid flags from origblob */ + newblob->invalid_flags = origblob->invalid_flags; + + /* inherit parent from original blob if set */ + newblob->parent_id = origblob->parent_id; + if (origblob->parent_id != SPDK_BLOBID_INVALID) { + /* Set internal xattr for snapshot id */ + bserrno = _spdk_blob_set_xattr(newblob, BLOB_SNAPSHOT, + &origblob->parent_id, sizeof(spdk_blob_id), true); + if (bserrno != 0) { + _spdk_bs_clone_snapshot_newblob_cleanup(ctx, bserrno); + return; + } + } + + /* Copy cluster map to snapshot */ + memcpy(newblob->active.clusters, origblob->active.clusters, + origblob->active.num_clusters * sizeof(origblob->active.clusters)); + + /* sync snapshot metadata */ + spdk_blob_sync_md(newblob, _spdk_bs_snapshot_newblob_sync_cpl, ctx); +} + +static void +_spdk_bs_snapshot_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + struct spdk_blob *newblob = _blob; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.blob = newblob; + + _spdk_blob_freeze_io(origblob, _spdk_bs_snapshot_freeze_cpl, ctx); +} + +static void +_spdk_bs_snapshot_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct 
spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *origblob = ctx->original.blob; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + ctx->new.id = blobid; + ctx->cpl.u.blobid.blobid = blobid; + + spdk_bs_open_blob(origblob->bs, ctx->new.id, _spdk_bs_snapshot_newblob_open_cpl, ctx); +} + + +static void +_spdk_bs_xattr_snapshot(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, SNAPSHOT_IN_PROGRESS, sizeof(SNAPSHOT_IN_PROGRESS)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +_spdk_bs_snapshot_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattrs_names[] = { SNAPSHOT_IN_PROGRESS }; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (_blob->data_ro || _blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Cannot create snapshot from read only blob with id %lu\n", + _blob->id); + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); + return; + } + + spdk_blob_opts_init(&opts); + _spdk_blob_xattrs_init(&internal_xattrs); + + /* Change the size of new blob to the same as in original blob, + * but do not allocate clusters */ + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + + /* If there are any xattrs specified for snapshot, set them now */ + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + /* Set internal xattr SNAPSHOT_IN_PROGRESS */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattrs_names; + internal_xattrs.get_value = _spdk_bs_xattr_snapshot; + + _spdk_bs_create_blob(_blob->bs, &opts, &internal_xattrs, + _spdk_bs_snapshot_newblob_create_cpl, ctx); +} + +void spdk_bs_create_snapshot(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts *snapshot_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->frozen = false; + ctx->original.id = blobid; + ctx->xattrs = snapshot_xattrs; + + spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_snapshot_origblob_open_cpl, ctx); +} +/* END spdk_bs_create_snapshot */ + +/* START spdk_bs_create_clone */ + +static void +_spdk_bs_xattr_clone(void *arg, const char *name, + const void **value, size_t *value_len) +{ + assert(strncmp(name, BLOB_SNAPSHOT, sizeof(BLOB_SNAPSHOT)) == 0); + + struct spdk_blob *blob = (struct spdk_blob *)arg; + *value = &blob->id; + *value_len = sizeof(blob->id); +} + +static void +_spdk_bs_clone_newblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *clone = _blob; + + ctx->new.blob = clone; + _spdk_bs_blob_list_add(clone); + + spdk_blob_close(clone, _spdk_bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +_spdk_bs_clone_newblob_create_cpl(void *cb_arg, spdk_blob_id blobid, int bserrno) 
+{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + + ctx->cpl.u.blobid.blobid = blobid; + spdk_bs_open_blob(ctx->original.blob->bs, blobid, _spdk_bs_clone_newblob_open_cpl, ctx); +} + +static void +_spdk_bs_clone_origblob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob_opts opts; + struct spdk_blob_xattr_opts internal_xattrs; + char *xattr_names[] = { BLOB_SNAPSHOT }; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + + ctx->original.blob = _blob; + + if (!_blob->data_ro || !_blob->md_ro) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Clone not from read-only blob\n"); + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); + return; + } + + spdk_blob_opts_init(&opts); + _spdk_blob_xattrs_init(&internal_xattrs); + + opts.thin_provision = true; + opts.num_clusters = spdk_blob_get_num_clusters(_blob); + if (ctx->xattrs) { + memcpy(&opts.xattrs, ctx->xattrs, sizeof(*ctx->xattrs)); + } + + /* Set internal xattr BLOB_SNAPSHOT */ + internal_xattrs.count = 1; + internal_xattrs.ctx = _blob; + internal_xattrs.names = xattr_names; + internal_xattrs.get_value = _spdk_bs_xattr_clone; + + _spdk_bs_create_blob(_blob->bs, &opts, &internal_xattrs, + _spdk_bs_clone_newblob_create_cpl, ctx); +} + +void spdk_bs_create_clone(struct spdk_blob_store *bs, spdk_blob_id blobid, + const struct spdk_blob_xattr_opts *clone_xattrs, + spdk_blob_op_with_id_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, SPDK_BLOBID_INVALID, -ENOMEM); + return; + } + + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOBID; + ctx->cpl.u.blobid.cb_fn = cb_fn; + ctx->cpl.u.blobid.cb_arg = cb_arg; + ctx->cpl.u.blobid.blobid = SPDK_BLOBID_INVALID; + ctx->bserrno = 0; + ctx->xattrs = clone_xattrs; + ctx->original.id = blobid; + + spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_clone_origblob_open_cpl, ctx); +} + +/* END spdk_bs_create_clone */ + +/* START spdk_bs_inflate_blob */ + +static void +_spdk_bs_inflate_blob_set_parent_cpl(void *cb_arg, struct spdk_blob *_parent, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + assert(_parent != NULL); + + _spdk_bs_blob_list_remove(_blob); + _blob->parent_id = _parent->id; + _spdk_blob_set_xattr(_blob, BLOB_SNAPSHOT, &_blob->parent_id, + sizeof(spdk_blob_id), true); + + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = spdk_bs_create_blob_bs_dev(_parent); + _spdk_bs_blob_list_add(_blob); + + spdk_blob_sync_md(_blob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx); +} + +static void +_spdk_bs_inflate_blob_done(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + struct spdk_blob *_parent; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + if (ctx->allocate_all) { + /* remove thin provisioning */ + _spdk_bs_blob_list_remove(_blob); + _spdk_blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->invalid_flags = _blob->invalid_flags & ~SPDK_BLOB_THIN_PROV; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = NULL; + _blob->parent_id = SPDK_BLOBID_INVALID; + } else { 
+ _parent = ((struct spdk_blob_bs_dev *)(_blob->back_bs_dev))->blob; + if (_parent->parent_id != SPDK_BLOBID_INVALID) { + /* We must change the parent of the inflated blob */ + spdk_bs_open_blob(_blob->bs, _parent->parent_id, + _spdk_bs_inflate_blob_set_parent_cpl, ctx); + return; + } + + _spdk_bs_blob_list_remove(_blob); + _spdk_blob_remove_xattr(_blob, BLOB_SNAPSHOT, true); + _blob->parent_id = SPDK_BLOBID_INVALID; + _blob->back_bs_dev->destroy(_blob->back_bs_dev); + _blob->back_bs_dev = spdk_bs_create_zeroes_dev(); + } + + _blob->state = SPDK_BLOB_STATE_DIRTY; + spdk_blob_sync_md(_blob, _spdk_bs_clone_snapshot_origblob_cleanup, ctx); +} + +/* Check if cluster needs allocation */ +static inline bool +_spdk_bs_cluster_needs_allocation(struct spdk_blob *blob, uint64_t cluster, bool allocate_all) +{ + struct spdk_blob_bs_dev *b; + + assert(blob != NULL); + + if (blob->active.clusters[cluster] != 0) { + /* Cluster is already allocated */ + return false; + } + + if (blob->parent_id == SPDK_BLOBID_INVALID) { + /* Blob have no parent blob */ + return allocate_all; + } + + b = (struct spdk_blob_bs_dev *)blob->back_bs_dev; + return (allocate_all || b->blob->active.clusters[cluster] != 0); +} + +static void +_spdk_bs_inflate_blob_touch_next(void *cb_arg, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + struct spdk_blob *_blob = ctx->original.blob; + uint64_t offset; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, bserrno); + return; + } + + for (; ctx->cluster < _blob->active.num_clusters; ctx->cluster++) { + if (_spdk_bs_cluster_needs_allocation(_blob, ctx->cluster, ctx->allocate_all)) { + break; + } + } + + if (ctx->cluster < _blob->active.num_clusters) { + offset = _spdk_bs_cluster_to_lba(_blob->bs, ctx->cluster); + + /* We may safely increment a cluster before write */ + ctx->cluster++; + + /* Use zero length write to touch a cluster */ + spdk_blob_io_write(_blob, ctx->channel, NULL, offset, 0, + _spdk_bs_inflate_blob_touch_next, ctx); + } else { + _spdk_bs_inflate_blob_done(cb_arg, bserrno); + } +} + +static void +_spdk_bs_inflate_blob_open_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_clone_snapshot_ctx *ctx = (struct spdk_clone_snapshot_ctx *)cb_arg; + uint64_t lfc; /* lowest free cluster */ + uint64_t i; + + if (bserrno != 0) { + _spdk_bs_clone_snapshot_cleanup_finish(ctx, bserrno); + return; + } + ctx->original.blob = _blob; + + if (!ctx->allocate_all && _blob->parent_id == SPDK_BLOBID_INVALID) { + /* This blob have no parent, so we cannot decouple it. */ + SPDK_ERRLOG("Cannot decouple parent of blob with no parent.\n"); + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, -EINVAL); + return; + } + + if (spdk_blob_is_thin_provisioned(_blob) == false) { + /* This is not thin provisioned blob. No need to inflate. */ + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, 0); + return; + } + + /* Do two passes - one to verify that we can obtain enough clusters + * and another to actually claim them. + */ + lfc = 0; + for (i = 0; i < _blob->active.num_clusters; i++) { + if (_spdk_bs_cluster_needs_allocation(_blob, i, ctx->allocate_all)) { + lfc = spdk_bit_array_find_first_clear(_blob->bs->used_clusters, lfc); + if (lfc == UINT32_MAX) { + /* No more free clusters. 
Cannot satisfy the request */ + _spdk_bs_clone_snapshot_origblob_cleanup(ctx, -ENOSPC); + return; + } + lfc++; + } + } + + ctx->cluster = 0; + _spdk_bs_inflate_blob_touch_next(ctx, 0); +} + +static void +_spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, bool allocate_all, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_clone_snapshot_ctx *ctx = calloc(1, sizeof(*ctx)); + + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + ctx->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + ctx->cpl.u.bs_basic.cb_fn = cb_fn; + ctx->cpl.u.bs_basic.cb_arg = cb_arg; + ctx->bserrno = 0; + ctx->original.id = blobid; + ctx->channel = channel; + ctx->allocate_all = allocate_all; + + spdk_bs_open_blob(bs, ctx->original.id, _spdk_bs_inflate_blob_open_cpl, ctx); +} + +void +spdk_bs_inflate_blob(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_bs_inflate_blob(bs, channel, blobid, true, cb_fn, cb_arg); +} + +void +spdk_bs_blob_decouple_parent(struct spdk_blob_store *bs, struct spdk_io_channel *channel, + spdk_blob_id blobid, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_bs_inflate_blob(bs, channel, blobid, false, cb_fn, cb_arg); +} +/* END spdk_bs_inflate_blob */ + +/* START spdk_blob_resize */ +struct spdk_bs_resize_ctx { + spdk_blob_op_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + uint64_t sz; + int rc; +}; + +static void +_spdk_bs_resize_unfreeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; + + if (rc != 0) { + SPDK_ERRLOG("Unfreeze failed, rc=%d\n", rc); + } + + if (ctx->rc != 0) { + SPDK_ERRLOG("Unfreeze failed, ctx->rc=%d\n", ctx->rc); + rc = ctx->rc; + } + + ctx->blob->resize_in_progress = false; + + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); +} + +static void +_spdk_bs_resize_freeze_cpl(void *cb_arg, int rc) +{ + struct spdk_bs_resize_ctx *ctx = (struct spdk_bs_resize_ctx *)cb_arg; + + if (rc != 0) { + ctx->blob->resize_in_progress = false; + ctx->cb_fn(ctx->cb_arg, rc); + free(ctx); + return; + } + + ctx->rc = _spdk_blob_resize(ctx->blob, ctx->sz); + + _spdk_blob_unfreeze_io(ctx->blob, _spdk_bs_resize_unfreeze_cpl, ctx); +} + +void +spdk_blob_resize(struct spdk_blob *blob, uint64_t sz, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_resize_ctx *ctx; + + _spdk_blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Resizing blob %lu to %lu clusters\n", blob->id, sz); + + if (blob->md_ro) { + cb_fn(cb_arg, -EPERM); + return; + } + + if (sz == blob->active.num_clusters) { + cb_fn(cb_arg, 0); + return; + } + + if (blob->resize_in_progress) { + cb_fn(cb_arg, -EBUSY); + return; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + blob->resize_in_progress = true; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + ctx->blob = blob; + ctx->sz = sz; + _spdk_blob_freeze_io(blob, _spdk_bs_resize_freeze_cpl, ctx); +} + +/* END spdk_blob_resize */ + + +/* START spdk_bs_delete_blob */ + +static void +_spdk_bs_delete_close_cpl(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + spdk_bs_sequence_finish(seq, bserrno); +} + +static void +_spdk_bs_delete_persist_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno != 0) { + /* + * We already removed this blob from the blobstore tailq, so + * we need to free it here since this is the last reference + * to it. 
+ */ + _spdk_blob_free(blob); + _spdk_bs_delete_close_cpl(seq, bserrno); + return; + } + + /* + * This will immediately decrement the ref_count and call + * the completion routine since the metadata state is clean. + * By calling spdk_blob_close, we reduce the number of call + * points into code that touches the blob->open_ref count + * and the blobstore's blob list. + */ + spdk_blob_close(blob, _spdk_bs_delete_close_cpl, seq); +} + +static void +_spdk_bs_delete_open_cpl(void *cb_arg, struct spdk_blob *blob, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + uint32_t page_num; + + if (bserrno != 0) { + spdk_bs_sequence_finish(seq, bserrno); + return; + } + + _spdk_blob_verify_md_op(blob); + + if (blob->open_ref > 1) { + /* + * Someone has this blob open (besides this delete context). + * Decrement the ref count directly and return -EBUSY. + */ + blob->open_ref--; + spdk_bs_sequence_finish(seq, -EBUSY); + return; + } + + bserrno = _spdk_bs_blob_list_remove(blob); + if (bserrno != 0) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Remove blob #%" PRIu64 " from a list\n", blob->id); + spdk_bs_sequence_finish(seq, bserrno); + return; + } + + /* + * Remove the blob from the blob_store list now, to ensure it does not + * get returned after this point by _spdk_blob_lookup(). + */ + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + page_num = _spdk_bs_blobid_to_page(blob->id); + spdk_bit_array_clear(blob->bs->used_blobids, page_num); + blob->state = SPDK_BLOB_STATE_DIRTY; + blob->active.num_pages = 0; + _spdk_blob_resize(blob, 0); + + _spdk_blob_persist(seq, blob, _spdk_bs_delete_persist_cpl, blob); +} + +void +spdk_bs_delete_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + struct spdk_blob_list *snapshot_entry = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Deleting blob %lu\n", blobid); + + assert(spdk_get_thread() == bs->md_thread); + + /* Check if this is a snapshot with clones */ + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + if (snapshot_entry->id == blobid) { + break; + } + } + if (snapshot_entry != NULL) { + /* If snapshot have clones, we cannot remove it */ + if (!TAILQ_EMPTY(&snapshot_entry->clones)) { + SPDK_ERRLOG("Cannot remove snapshot with clones\n"); + cb_fn(cb_arg, -EBUSY); + return; + } + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_open_blob(bs, blobid, _spdk_bs_delete_open_cpl, seq); +} + +/* END spdk_bs_delete_blob */ + +/* START spdk_bs_open_blob */ + +static void +_spdk_bs_open_blob_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + /* If the blob have crc error, we just return NULL. 
*/ + if (blob == NULL) { + seq->cpl.u.blob_handle.blob = NULL; + spdk_bs_sequence_finish(seq, bserrno); + return; + } + + blob->open_ref++; + + TAILQ_INSERT_HEAD(&blob->bs->blobs, blob, link); + + spdk_bs_sequence_finish(seq, bserrno); +} + +void spdk_bs_open_blob(struct spdk_blob_store *bs, spdk_blob_id blobid, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob; + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + uint32_t page_num; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Opening blob %lu\n", blobid); + assert(spdk_get_thread() == bs->md_thread); + + page_num = _spdk_bs_blobid_to_page(blobid); + if (spdk_bit_array_get(bs->used_blobids, page_num) == false) { + /* Invalid blobid */ + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + blob = _spdk_blob_lookup(bs, blobid); + if (blob) { + blob->open_ref++; + cb_fn(cb_arg, blob, 0); + return; + } + + blob = _spdk_blob_alloc(bs, blobid); + if (!blob) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_HANDLE; + cpl.u.blob_handle.cb_fn = cb_fn; + cpl.u.blob_handle.cb_arg = cb_arg; + cpl.u.blob_handle.blob = blob; + + seq = spdk_bs_sequence_start(bs->md_channel, &cpl); + if (!seq) { + _spdk_blob_free(blob); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + _spdk_blob_load(seq, blob, _spdk_bs_open_blob_cpl, blob); +} +/* END spdk_bs_open_blob */ + +/* START spdk_blob_set_read_only */ +int spdk_blob_set_read_only(struct spdk_blob *blob) +{ + _spdk_blob_verify_md_op(blob); + + blob->data_ro_flags |= SPDK_BLOB_READ_ONLY; + + blob->state = SPDK_BLOB_STATE_DIRTY; + return 0; +} +/* END spdk_blob_set_read_only */ + +/* START spdk_blob_sync_md */ + +static void +_spdk_blob_sync_md_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0 && (blob->data_ro_flags & SPDK_BLOB_READ_ONLY)) { + blob->data_ro = true; + blob->md_ro = true; + } + + spdk_bs_sequence_finish(seq, bserrno); +} + +static void +_spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + _spdk_blob_persist(seq, blob, _spdk_blob_sync_md_cpl, blob); +} + +void +spdk_blob_sync_md(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Syncing blob %lu\n", blob->id); + + if (blob->md_ro) { + assert(blob->state == SPDK_BLOB_STATE_CLEAN); + cb_fn(cb_arg, 0); + return; + } + + _spdk_blob_sync_md(blob, cb_fn, cb_arg); +} + +/* END spdk_blob_sync_md */ + +struct spdk_blob_insert_cluster_ctx { + struct spdk_thread *thread; + struct spdk_blob *blob; + uint32_t cluster_num; /* cluster index in blob */ + uint32_t cluster; /* cluster on disk */ + int rc; + spdk_blob_op_complete cb_fn; + void *cb_arg; +}; + +static void +_spdk_blob_insert_cluster_msg_cpl(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->cb_fn(ctx->cb_arg, ctx->rc); + free(ctx); +} + +static void +_spdk_blob_insert_cluster_msg_cb(void *arg, int bserrno) +{ + struct spdk_blob_insert_cluster_ctx *ctx = arg; + + ctx->rc = bserrno; + spdk_thread_send_msg(ctx->thread, _spdk_blob_insert_cluster_msg_cpl, ctx); +} + +static void +_spdk_blob_insert_cluster_msg(void *arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx = 
arg; + + ctx->rc = _spdk_blob_insert_cluster(ctx->blob, ctx->cluster_num, ctx->cluster); + if (ctx->rc != 0) { + spdk_thread_send_msg(ctx->thread, _spdk_blob_insert_cluster_msg_cpl, ctx); + return; + } + + ctx->blob->state = SPDK_BLOB_STATE_DIRTY; + _spdk_blob_sync_md(ctx->blob, _spdk_blob_insert_cluster_msg_cb, ctx); +} + +static void +_spdk_blob_insert_cluster_on_md_thread(struct spdk_blob *blob, uint32_t cluster_num, + uint64_t cluster, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob_insert_cluster_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->thread = spdk_get_thread(); + ctx->blob = blob; + ctx->cluster_num = cluster_num; + ctx->cluster = cluster; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_thread_send_msg(blob->bs->md_thread, _spdk_blob_insert_cluster_msg, ctx); +} + +/* START spdk_blob_close */ + +static void +_spdk_blob_close_cpl(spdk_bs_sequence_t *seq, void *cb_arg, int bserrno) +{ + struct spdk_blob *blob = cb_arg; + + if (bserrno == 0) { + blob->open_ref--; + if (blob->open_ref == 0) { + /* + * Blobs with active.num_pages == 0 are deleted blobs. + * these blobs are removed from the blob_store list + * when the deletion process starts - so don't try to + * remove them again. + */ + if (blob->active.num_pages > 0) { + TAILQ_REMOVE(&blob->bs->blobs, blob, link); + } + _spdk_blob_free(blob); + } + } + + spdk_bs_sequence_finish(seq, bserrno); +} + +void spdk_blob_close(struct spdk_blob *blob, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_cpl cpl; + spdk_bs_sequence_t *seq; + + _spdk_blob_verify_md_op(blob); + + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "Closing blob %lu\n", blob->id); + + if (blob->open_ref == 0) { + cb_fn(cb_arg, -EBADF); + return; + } + + cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + cpl.u.blob_basic.cb_fn = cb_fn; + cpl.u.blob_basic.cb_arg = cb_arg; + + seq = spdk_bs_sequence_start(blob->bs->md_channel, &cpl); + if (!seq) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + /* Sync metadata */ + _spdk_blob_persist(seq, blob, _spdk_blob_close_cpl, blob); +} + +/* END spdk_blob_close */ + +struct spdk_io_channel *spdk_bs_alloc_io_channel(struct spdk_blob_store *bs) +{ + return spdk_get_io_channel(bs); +} + +void spdk_bs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +void spdk_blob_io_unmap(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_UNMAP); +} + +void spdk_blob_io_write_zeroes(struct spdk_blob *blob, struct spdk_io_channel *channel, + uint64_t offset, uint64_t length, spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_op(blob, channel, NULL, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE_ZEROES); +} + +void spdk_blob_io_write(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_WRITE); +} + +void spdk_blob_io_read(struct spdk_blob *blob, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_op(blob, channel, payload, offset, length, cb_fn, cb_arg, + SPDK_BLOB_READ); +} + +void spdk_blob_io_writev(struct spdk_blob *blob, 
struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, false); +} + +void spdk_blob_io_readv(struct spdk_blob *blob, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, uint64_t offset, uint64_t length, + spdk_blob_op_complete cb_fn, void *cb_arg) +{ + _spdk_blob_request_submit_rw_iov(blob, channel, iov, iovcnt, offset, length, cb_fn, cb_arg, true); +} + +struct spdk_bs_iter_ctx { + int64_t page_num; + struct spdk_blob_store *bs; + + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; +}; + +static void +_spdk_bs_iter_cpl(void *cb_arg, struct spdk_blob *_blob, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + struct spdk_blob_store *bs = ctx->bs; + spdk_blob_id id; + + if (bserrno == 0) { + ctx->cb_fn(ctx->cb_arg, _blob, bserrno); + free(ctx); + return; + } + + ctx->page_num++; + ctx->page_num = spdk_bit_array_find_first_set(bs->used_blobids, ctx->page_num); + if (ctx->page_num >= spdk_bit_array_capacity(bs->used_blobids)) { + ctx->cb_fn(ctx->cb_arg, NULL, -ENOENT); + free(ctx); + return; + } + + id = _spdk_bs_page_to_blobid(ctx->page_num); + + spdk_bs_open_blob(bs, id, _spdk_bs_iter_cpl, ctx); +} + +void +spdk_bs_iter_first(struct spdk_blob_store *bs, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = -1; + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + _spdk_bs_iter_cpl(ctx, NULL, -1); +} + +static void +_spdk_bs_iter_close_cpl(void *cb_arg, int bserrno) +{ + struct spdk_bs_iter_ctx *ctx = cb_arg; + + _spdk_bs_iter_cpl(ctx, NULL, -1); +} + +void +spdk_bs_iter_next(struct spdk_blob_store *bs, struct spdk_blob *blob, + spdk_blob_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_bs_iter_ctx *ctx; + + assert(blob != NULL); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + ctx->page_num = _spdk_bs_blobid_to_page(blob->id); + ctx->bs = bs; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + /* Close the existing blob */ + spdk_blob_close(blob, _spdk_bs_iter_close_cpl, ctx); +} + +static int +_spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + + _spdk_blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + + if (internal) { + xattrs = &blob->xattrs_internal; + blob->invalid_flags |= SPDK_BLOB_INTERNAL_XATTR; + } else { + xattrs = &blob->xattrs; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + free(xattr->value); + xattr->value_len = value_len; + xattr->value = malloc(value_len); + memcpy(xattr->value, value, value_len); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + xattr = calloc(1, sizeof(*xattr)); + if (!xattr) { + return -ENOMEM; + } + xattr->name = strdup(name); + xattr->value_len = value_len; + xattr->value = malloc(value_len); + memcpy(xattr->value, value, value_len); + TAILQ_INSERT_TAIL(xattrs, xattr, link); + + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; +} + +int +spdk_blob_set_xattr(struct spdk_blob *blob, const char *name, const void *value, + uint16_t value_len) +{ + return _spdk_blob_set_xattr(blob, name, value, value_len, false); +} + 
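+/*
+ * Illustrative sketch of how the public xattr calls defined in this file are
+ * typically driven (the open blob handle, the "name" key, the value and the
+ * completion callback below are assumptions for the example, not part of this
+ * code). Setting or removing an xattr only marks the blob dirty; the change is
+ * persisted by a later spdk_blob_sync_md() or spdk_blob_close().
+ *
+ *	const char *val = "bdev0";
+ *	const void *out;
+ *	size_t out_len;
+ *
+ *	spdk_blob_set_xattr(blob, "name", val, strlen(val) + 1);
+ *	if (spdk_blob_get_xattr_value(blob, "name", &out, &out_len) == 0) {
+ *		out points at the blob's internal copy and stays valid until
+ *		the xattr is overwritten or removed;
+ *	}
+ *	spdk_blob_remove_xattr(blob, "name");
+ *	spdk_blob_sync_md(blob, sync_done_cb, cb_arg);
+ */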
+static int +_spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name, bool internal) +{ + struct spdk_xattr_tailq *xattrs; + struct spdk_xattr *xattr; + + _spdk_blob_verify_md_op(blob); + + if (blob->md_ro) { + return -EPERM; + } + xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + TAILQ_REMOVE(xattrs, xattr, link); + free(xattr->value); + free(xattr->name); + free(xattr); + + if (internal && TAILQ_EMPTY(&blob->xattrs_internal)) { + blob->invalid_flags &= ~SPDK_BLOB_INTERNAL_XATTR; + } + blob->state = SPDK_BLOB_STATE_DIRTY; + + return 0; + } + } + + return -ENOENT; +} + +int +spdk_blob_remove_xattr(struct spdk_blob *blob, const char *name) +{ + return _spdk_blob_remove_xattr(blob, name, false); +} + +static int +_spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len, bool internal) +{ + struct spdk_xattr *xattr; + struct spdk_xattr_tailq *xattrs; + + xattrs = internal ? &blob->xattrs_internal : &blob->xattrs; + + TAILQ_FOREACH(xattr, xattrs, link) { + if (!strcmp(name, xattr->name)) { + *value = xattr->value; + *value_len = xattr->value_len; + return 0; + } + } + return -ENOENT; +} + +int +spdk_blob_get_xattr_value(struct spdk_blob *blob, const char *name, + const void **value, size_t *value_len) +{ + _spdk_blob_verify_md_op(blob); + + return _spdk_blob_get_xattr_value(blob, name, value, value_len, false); +} + +struct spdk_xattr_names { + uint32_t count; + const char *names[0]; +}; + +static int +_spdk_blob_get_xattr_names(struct spdk_xattr_tailq *xattrs, struct spdk_xattr_names **names) +{ + struct spdk_xattr *xattr; + int count = 0; + + TAILQ_FOREACH(xattr, xattrs, link) { + count++; + } + + *names = calloc(1, sizeof(struct spdk_xattr_names) + count * sizeof(char *)); + if (*names == NULL) { + return -ENOMEM; + } + + TAILQ_FOREACH(xattr, xattrs, link) { + (*names)->names[(*names)->count++] = xattr->name; + } + + return 0; +} + +int +spdk_blob_get_xattr_names(struct spdk_blob *blob, struct spdk_xattr_names **names) +{ + _spdk_blob_verify_md_op(blob); + + return _spdk_blob_get_xattr_names(&blob->xattrs, names); +} + +uint32_t +spdk_xattr_names_get_count(struct spdk_xattr_names *names) +{ + assert(names != NULL); + + return names->count; +} + +const char * +spdk_xattr_names_get_name(struct spdk_xattr_names *names, uint32_t index) +{ + if (index >= names->count) { + return NULL; + } + + return names->names[index]; +} + +void +spdk_xattr_names_free(struct spdk_xattr_names *names) +{ + free(names); +} + +struct spdk_bs_type +spdk_bs_get_bstype(struct spdk_blob_store *bs) +{ + return bs->bstype; +} + +void +spdk_bs_set_bstype(struct spdk_blob_store *bs, struct spdk_bs_type bstype) +{ + memcpy(&bs->bstype, &bstype, sizeof(bstype)); +} + +bool +spdk_blob_is_read_only(struct spdk_blob *blob) +{ + assert(blob != NULL); + return (blob->data_ro || blob->md_ro); +} + +bool +spdk_blob_is_snapshot(struct spdk_blob *blob) +{ + struct spdk_blob_list *snapshot_entry; + + assert(blob != NULL); + + TAILQ_FOREACH(snapshot_entry, &blob->bs->snapshots, link) { + if (snapshot_entry->id == blob->id) { + break; + } + } + + if (snapshot_entry == NULL) { + return false; + } + + return true; +} + +bool +spdk_blob_is_clone(struct spdk_blob *blob) +{ + assert(blob != NULL); + + if (blob->parent_id != SPDK_BLOBID_INVALID) { + assert(spdk_blob_is_thin_provisioned(blob)); + return true; + } + + return false; +} + +bool +spdk_blob_is_thin_provisioned(struct spdk_blob *blob) 
+{ + assert(blob != NULL); + return !!(blob->invalid_flags & SPDK_BLOB_THIN_PROV); +} + +spdk_blob_id +spdk_blob_get_parent_snapshot(struct spdk_blob_store *bs, spdk_blob_id blob_id) +{ + struct spdk_blob_list *snapshot_entry = NULL; + struct spdk_blob_list *clone_entry = NULL; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + if (clone_entry->id == blob_id) { + return snapshot_entry->id; + } + } + } + + return SPDK_BLOBID_INVALID; +} + +int +spdk_blob_get_clones(struct spdk_blob_store *bs, spdk_blob_id blobid, spdk_blob_id *ids, + size_t *count) +{ + struct spdk_blob_list *snapshot_entry, *clone_entry; + size_t n; + + TAILQ_FOREACH(snapshot_entry, &bs->snapshots, link) { + if (snapshot_entry->id == blobid) { + break; + } + } + if (snapshot_entry == NULL) { + *count = 0; + return 0; + } + + if (ids == NULL || *count < snapshot_entry->clone_count) { + *count = snapshot_entry->clone_count; + return -ENOMEM; + } + *count = snapshot_entry->clone_count; + + n = 0; + TAILQ_FOREACH(clone_entry, &snapshot_entry->clones, link) { + ids[n++] = clone_entry->id; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("blob", SPDK_LOG_BLOB) diff --git a/src/spdk/lib/blob/blobstore.h b/src/spdk/lib/blob/blobstore.h new file mode 100644 index 00000000..60df98d8 --- /dev/null +++ b/src/spdk/lib/blob/blobstore.h @@ -0,0 +1,572 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_BLOBSTORE_H +#define SPDK_BLOBSTORE_H + +#include "spdk/assert.h" +#include "spdk/blob.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#include "request.h" + +/* In Memory Data Structures + * + * The following data structures exist only in memory. 
+ */ + +#define SPDK_BLOB_OPTS_CLUSTER_SZ (1024 * 1024) +#define SPDK_BLOB_OPTS_NUM_MD_PAGES UINT32_MAX +#define SPDK_BLOB_OPTS_MAX_MD_OPS 32 +#define SPDK_BLOB_OPTS_DEFAULT_CHANNEL_OPS 512 +#define SPDK_BLOB_BLOBID_HIGH_BIT (1ULL << 32) + +struct spdk_xattr { + uint32_t index; + uint16_t value_len; + char *name; + void *value; + TAILQ_ENTRY(spdk_xattr) link; +}; + +/* The mutable part of the blob data that is sync'd to + * disk. The data in here is both mutable and persistent. + */ +struct spdk_blob_mut_data { + /* Number of data clusters in the blob */ + uint64_t num_clusters; + + /* Array LBAs that are the beginning of a cluster, in + * the order they appear in the blob. + */ + uint64_t *clusters; + + /* The size of the clusters array. This is greater than or + * equal to 'num_clusters'. + */ + size_t cluster_array_size; + + /* Number of metadata pages */ + uint32_t num_pages; + + /* Array of page offsets into the metadata region, in + * the order of the metadata page sequence. + */ + uint32_t *pages; +}; + +enum spdk_blob_state { + /* The blob in-memory version does not match the on-disk + * version. + */ + SPDK_BLOB_STATE_DIRTY, + + /* The blob in memory version of the blob matches the on disk + * version. + */ + SPDK_BLOB_STATE_CLEAN, + + /* The in-memory state being synchronized with the on-disk + * blob state. */ + SPDK_BLOB_STATE_LOADING, +}; + +TAILQ_HEAD(spdk_xattr_tailq, spdk_xattr); + +struct spdk_blob_list { + spdk_blob_id id; + size_t clone_count; + TAILQ_HEAD(, spdk_blob_list) clones; + TAILQ_ENTRY(spdk_blob_list) link; +}; + +struct spdk_blob { + struct spdk_blob_store *bs; + + uint32_t open_ref; + + spdk_blob_id id; + spdk_blob_id parent_id; + + enum spdk_blob_state state; + + /* Two copies of the mutable data. One is a version + * that matches the last known data on disk (clean). + * The other (active) is the current data. Syncing + * a blob makes the clean match the active. + */ + struct spdk_blob_mut_data clean; + struct spdk_blob_mut_data active; + + bool invalid; + bool data_ro; + bool md_ro; + + uint64_t invalid_flags; + uint64_t data_ro_flags; + uint64_t md_ro_flags; + + struct spdk_bs_dev *back_bs_dev; + + /* TODO: The xattrs are mutable, but we don't want to be + * copying them unnecessarily. Figure this out. 
+ */ + struct spdk_xattr_tailq xattrs; + struct spdk_xattr_tailq xattrs_internal; + + TAILQ_ENTRY(spdk_blob) link; + + uint32_t frozen_refcnt; + bool resize_in_progress; +}; + +struct spdk_blob_store { + uint64_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_io_channel *md_channel; + uint32_t max_channel_ops; + + struct spdk_thread *md_thread; + + struct spdk_bs_dev *dev; + + struct spdk_bit_array *used_md_pages; + struct spdk_bit_array *used_clusters; + struct spdk_bit_array *used_blobids; + + pthread_mutex_t used_clusters_mutex; + + uint32_t cluster_sz; + uint64_t total_clusters; + uint64_t total_data_clusters; + uint64_t num_free_clusters; + uint64_t pages_per_cluster; + uint32_t io_unit_size; + + spdk_blob_id super_blob; + struct spdk_bs_type bstype; + + struct spdk_bs_cpl unload_cpl; + int unload_err; + + TAILQ_HEAD(, spdk_blob) blobs; + TAILQ_HEAD(, spdk_blob_list) snapshots; + + bool clean; +}; + +struct spdk_bs_channel { + struct spdk_bs_request_set *req_mem; + TAILQ_HEAD(, spdk_bs_request_set) reqs; + + struct spdk_blob_store *bs; + + struct spdk_bs_dev *dev; + struct spdk_io_channel *dev_channel; + + TAILQ_HEAD(, spdk_bs_request_set) need_cluster_alloc; + TAILQ_HEAD(, spdk_bs_request_set) queued_io; +}; + +/** operation type */ +enum spdk_blob_op_type { + SPDK_BLOB_WRITE, + SPDK_BLOB_READ, + SPDK_BLOB_UNMAP, + SPDK_BLOB_WRITE_ZEROES, + SPDK_BLOB_WRITEV, + SPDK_BLOB_READV, +}; + +/* back bs_dev */ + +#define BLOB_SNAPSHOT "SNAP" +#define SNAPSHOT_IN_PROGRESS "SNAPTMP" + +struct spdk_blob_bs_dev { + struct spdk_bs_dev bs_dev; + struct spdk_blob *blob; +}; + +/* On-Disk Data Structures + * + * The following data structures exist on disk. + */ +#define SPDK_BS_INITIAL_VERSION 1 +#define SPDK_BS_VERSION 3 /* current version */ + +#pragma pack(push, 1) + +#define SPDK_MD_MASK_TYPE_USED_PAGES 0 +#define SPDK_MD_MASK_TYPE_USED_CLUSTERS 1 +#define SPDK_MD_MASK_TYPE_USED_BLOBIDS 2 + +struct spdk_bs_md_mask { + uint8_t type; + uint32_t length; /* In bits */ + uint8_t mask[0]; +}; + +#define SPDK_MD_DESCRIPTOR_TYPE_PADDING 0 +#define SPDK_MD_DESCRIPTOR_TYPE_EXTENT 1 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR 2 +#define SPDK_MD_DESCRIPTOR_TYPE_FLAGS 3 +#define SPDK_MD_DESCRIPTOR_TYPE_XATTR_INTERNAL 4 + +struct spdk_blob_md_descriptor_xattr { + uint8_t type; + uint32_t length; + + uint16_t name_length; + uint16_t value_length; + + char name[0]; + /* String name immediately followed by string value. */ +}; + +struct spdk_blob_md_descriptor_extent { + uint8_t type; + uint32_t length; + + struct { + uint32_t cluster_idx; + uint32_t length; /* In units of clusters */ + } extents[0]; +}; + +#define SPDK_BLOB_THIN_PROV (1ULL << 0) +#define SPDK_BLOB_INTERNAL_XATTR (1ULL << 1) +#define SPDK_BLOB_INVALID_FLAGS_MASK (SPDK_BLOB_THIN_PROV | SPDK_BLOB_INTERNAL_XATTR) + +#define SPDK_BLOB_READ_ONLY (1ULL << 0) +#define SPDK_BLOB_DATA_RO_FLAGS_MASK SPDK_BLOB_READ_ONLY +#define SPDK_BLOB_MD_RO_FLAGS_MASK 0 + +struct spdk_blob_md_descriptor_flags { + uint8_t type; + uint32_t length; + + /* + * If a flag in invalid_flags is set that the application is not aware of, + * it will not allow the blob to be opened. + */ + uint64_t invalid_flags; + + /* + * If a flag in data_ro_flags is set that the application is not aware of, + * allow the blob to be opened in data_read_only and md_read_only mode. 
+ */ + uint64_t data_ro_flags; + + /* + * If a flag in md_ro_flags is set the the application is not aware of, + * allow the blob to be opened in md_read_only mode. + */ + uint64_t md_ro_flags; +}; + +struct spdk_blob_md_descriptor { + uint8_t type; + uint32_t length; +}; + +#define SPDK_INVALID_MD_PAGE UINT32_MAX + +struct spdk_blob_md_page { + spdk_blob_id id; + + uint32_t sequence_num; + uint32_t reserved0; + + /* Descriptors here */ + uint8_t descriptors[4072]; + + uint32_t next; + uint32_t crc; +}; +#define SPDK_BS_PAGE_SIZE 0x1000 +SPDK_STATIC_ASSERT(SPDK_BS_PAGE_SIZE == sizeof(struct spdk_blob_md_page), "Invalid md page size"); + +#define SPDK_BS_SUPER_BLOCK_SIG "SPDKBLOB" + +struct spdk_bs_super_block { + uint8_t signature[8]; + uint32_t version; + uint32_t length; + uint32_t clean; /* If there was a clean shutdown, this is 1. */ + spdk_blob_id super_blob; + + uint32_t cluster_size; /* In bytes */ + + uint32_t used_page_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_page_mask_len; /* Count, in pages */ + + uint32_t used_cluster_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_cluster_mask_len; /* Count, in pages */ + + uint32_t md_start; /* Offset from beginning of disk, in pages */ + uint32_t md_len; /* Count, in pages */ + + struct spdk_bs_type bstype; /* blobstore type */ + + uint32_t used_blobid_mask_start; /* Offset from beginning of disk, in pages */ + uint32_t used_blobid_mask_len; /* Count, in pages */ + + uint64_t size; /* size of blobstore in bytes */ + uint32_t io_unit_size; /* Size of io unit in bytes */ + + uint8_t reserved[4000]; + uint32_t crc; +}; +SPDK_STATIC_ASSERT(sizeof(struct spdk_bs_super_block) == 0x1000, "Invalid super block size"); + +#pragma pack(pop) + +struct spdk_bs_dev *spdk_bs_create_zeroes_dev(void); +struct spdk_bs_dev *spdk_bs_create_blob_bs_dev(struct spdk_blob *blob); + +/* Unit Conversions + * + * The blobstore works with several different units: + * - Byte: Self explanatory + * - LBA: The logical blocks on the backing storage device. + * - Page: The read/write units of blobs and metadata. This is + * an offset into a blob in units of 4KiB. + * - Cluster Index: The disk is broken into a sequential list of + * clusters. This is the offset from the beginning. + * + * NOTE: These conversions all act on simple magnitudes, not with any sort + * of knowledge about the blobs themselves. For instance, converting + * a page to an lba with the conversion function below simply converts + * a number of pages to an equivalent number of lbas, but that + * lba certainly isn't the right lba that corresponds to a page offset + * for a particular blob. 
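+ *
+ * A worked example with an assumed 512-byte device blocklen (the 4KiB page size
+ * and the 1MiB default cluster size come from this file): one page spans
+ * 4096 / 512 = 8 LBAs, so _spdk_bs_page_to_lba() maps page 10 to LBA 80; a
+ * cluster holds 1MiB / 4KiB = 256 pages, so cluster index 3 starts at page 768
+ * and, via _spdk_bs_cluster_to_lba(), at LBA 3 * (1048576 / 512) = 6144.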
+ */ +static inline uint64_t +_spdk_bs_byte_to_lba(struct spdk_blob_store *bs, uint64_t length) +{ + assert(length % bs->dev->blocklen == 0); + + return length / bs->dev->blocklen; +} + +static inline uint64_t +_spdk_bs_dev_byte_to_lba(struct spdk_bs_dev *bs_dev, uint64_t length) +{ + assert(length % bs_dev->blocklen == 0); + + return length / bs_dev->blocklen; +} + +static inline uint64_t +_spdk_bs_page_to_lba(struct spdk_blob_store *bs, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs->dev->blocklen; +} + +static inline uint64_t +_spdk_bs_dev_page_to_lba(struct spdk_bs_dev *bs_dev, uint64_t page) +{ + return page * SPDK_BS_PAGE_SIZE / bs_dev->blocklen; +} + +static inline uint64_t +_spdk_bs_io_unit_per_page(struct spdk_blob_store *bs) +{ + return SPDK_BS_PAGE_SIZE / bs->io_unit_size; +} + +static inline uint64_t +_spdk_bs_io_unit_to_page(struct spdk_blob_store *bs, uint64_t io_unit) +{ + return io_unit / _spdk_bs_io_unit_per_page(bs); +} + +static inline uint64_t +_spdk_bs_cluster_to_page(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * bs->pages_per_cluster; +} + +static inline uint32_t +_spdk_bs_page_to_cluster(struct spdk_blob_store *bs, uint64_t page) +{ + assert(page % bs->pages_per_cluster == 0); + + return page / bs->pages_per_cluster; +} + +static inline uint64_t +_spdk_bs_cluster_to_lba(struct spdk_blob_store *bs, uint32_t cluster) +{ + return (uint64_t)cluster * (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint32_t +_spdk_bs_lba_to_cluster(struct spdk_blob_store *bs, uint64_t lba) +{ + assert(lba % (bs->cluster_sz / bs->dev->blocklen) == 0); + + return lba / (bs->cluster_sz / bs->dev->blocklen); +} + +static inline uint64_t +_spdk_bs_io_unit_to_back_dev_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + return io_unit * (blob->bs->io_unit_size / blob->back_bs_dev->blocklen); +} + +static inline uint64_t +_spdk_bs_back_dev_lba_to_io_unit(struct spdk_blob *blob, uint64_t lba) +{ + return lba * (blob->back_bs_dev->blocklen / blob->bs->io_unit_size); +} + +/* End basic conversions */ + +static inline uint64_t +_spdk_bs_blobid_to_page(spdk_blob_id id) +{ + return id & 0xFFFFFFFF; +} + +/* The blob id is a 64 bit number. The lower 32 bits are the page_idx. The upper + * 32 bits are not currently used. Stick a 1 there just to catch bugs where the + * code assumes blob id == page_idx. + */ +static inline spdk_blob_id +_spdk_bs_page_to_blobid(uint64_t page_idx) +{ + if (page_idx > UINT32_MAX) { + return SPDK_BLOBID_INVALID; + } + return SPDK_BLOB_BLOBID_HIGH_BIT | page_idx; +} + +/* Given an io unit offset into a blob, look up the LBA for the + * start of that io unit. + */ +static inline uint64_t +_spdk_bs_blob_io_unit_to_lba(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t pages_per_cluster; + uint64_t io_units_per_cluster; + uint64_t io_units_per_page; + uint64_t page; + + page = _spdk_bs_io_unit_to_page(blob->bs, io_unit); + + pages_per_cluster = blob->bs->pages_per_cluster; + io_units_per_page = _spdk_bs_io_unit_per_page(blob->bs); + io_units_per_cluster = io_units_per_page * pages_per_cluster; + + assert(page < blob->active.num_clusters * pages_per_cluster); + + lba = blob->active.clusters[page / pages_per_cluster]; + lba += io_unit % io_units_per_cluster; + return lba; +} + +/* Given an io_unit offset into a blob, look up the number of io_units until the + * next cluster boundary. 
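+ * For example (illustrative sizes): with a 512-byte io_unit there are 8 io_units
+ * per 4KiB page, and with 256 pages per cluster a cluster holds 2048 io_units,
+ * so io_unit offset 2050 lies 2 io_units into its cluster and the function
+ * returns 2048 - 2 = 2046.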
+ */ +static inline uint32_t +_spdk_bs_num_io_units_to_cluster_boundary(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t io_units_per_cluster; + + io_units_per_cluster = _spdk_bs_io_unit_per_page(blob->bs) * blob->bs->pages_per_cluster; + + return io_units_per_cluster - (io_unit % io_units_per_cluster); +} + +/* Given a page offset into a blob, look up the number of pages until the + * next cluster boundary. + */ +static inline uint32_t +_spdk_bs_num_pages_to_cluster_boundary(struct spdk_blob *blob, uint64_t page) +{ + uint64_t pages_per_cluster; + + pages_per_cluster = blob->bs->pages_per_cluster; + + return pages_per_cluster - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +_spdk_bs_io_unit_to_cluster_start(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t pages_per_cluster; + uint64_t page; + + pages_per_cluster = blob->bs->pages_per_cluster; + page = _spdk_bs_io_unit_to_page(blob->bs, io_unit); + + return page - (page % pages_per_cluster); +} + +/* Given an io_unit offset into a blob, look up the number of pages into blob to beginning of current cluster */ +static inline uint32_t +_spdk_bs_io_unit_to_cluster_number(struct spdk_blob *blob, uint64_t io_unit) +{ + return (io_unit / _spdk_bs_io_unit_per_page(blob->bs)) / blob->bs->pages_per_cluster; +} + +/* Given an io unit offset into a blob, look up if it is from allocated cluster. */ +static inline bool +_spdk_bs_io_unit_is_allocated(struct spdk_blob *blob, uint64_t io_unit) +{ + uint64_t lba; + uint64_t page; + uint64_t pages_per_cluster; + + pages_per_cluster = blob->bs->pages_per_cluster; + page = _spdk_bs_io_unit_to_page(blob->bs, io_unit); + + assert(page < blob->active.num_clusters * pages_per_cluster); + + lba = blob->active.clusters[page / pages_per_cluster]; + + if (lba == 0) { + assert(spdk_blob_is_thin_provisioned(blob)); + return false; + } else { + return true; + } +} + +#endif diff --git a/src/spdk/lib/blob/request.c b/src/spdk/lib/blob/request.c new file mode 100644 index 00000000..b66fa765 --- /dev/null +++ b/src/spdk/lib/blob/request.c @@ -0,0 +1,558 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "blobstore.h" +#include "request.h" + +#include "spdk/thread.h" +#include "spdk/queue.h" + +#include "spdk_internal/log.h" + +void +spdk_bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno) +{ + switch (cpl->type) { + case SPDK_BS_CPL_TYPE_BS_BASIC: + cpl->u.bs_basic.cb_fn(cpl->u.bs_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BS_HANDLE: + cpl->u.bs_handle.cb_fn(cpl->u.bs_handle.cb_arg, + cpl->u.bs_handle.bs, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_BASIC: + cpl->u.blob_basic.cb_fn(cpl->u.blob_basic.cb_arg, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOBID: + cpl->u.blobid.cb_fn(cpl->u.blobid.cb_arg, + cpl->u.blobid.blobid, + bserrno); + break; + case SPDK_BS_CPL_TYPE_BLOB_HANDLE: + cpl->u.blob_handle.cb_fn(cpl->u.blob_handle.cb_arg, + cpl->u.blob_handle.blob, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NESTED_SEQUENCE: + cpl->u.nested_seq.cb_fn(cpl->u.nested_seq.cb_arg, + cpl->u.nested_seq.parent, + bserrno); + break; + case SPDK_BS_CPL_TYPE_NONE: + /* this completion's callback is handled elsewhere */ + break; + } +} + +static void +spdk_bs_request_set_complete(struct spdk_bs_request_set *set) +{ + struct spdk_bs_cpl cpl = set->cpl; + int bserrno = set->bserrno; + + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); + + spdk_bs_call_cpl(&cpl, bserrno); +} + +static void +spdk_bs_sequence_completion(struct spdk_io_channel *channel, void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->bserrno = bserrno; + set->u.sequence.cb_fn((spdk_bs_sequence_t *)set, set->u.sequence.cb_arg, bserrno); +} + +spdk_bs_sequence_t * +spdk_bs_sequence_start(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->cb_args.cb_fn = spdk_bs_sequence_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_sequence_t *)set; +} + +void +spdk_bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +spdk_bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set 
*)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +spdk_bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_readv_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + bs_dev->readv(bs_dev, spdk_io_channel_from_ctx(channel), iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_readv_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + channel->dev->readv(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_writev_dev(spdk_bs_sequence_t *seq, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->writev(channel->dev, channel->dev_channel, iov, iovcnt, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_unmap_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Unmapping %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "writing zeroes to %" 
PRIu32 " blocks at LBA %" PRIu64 "\n", + lba_count, lba); + + set->u.sequence.cb_fn = cb_fn; + set->u.sequence.cb_arg = cb_arg; + + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno) +{ + if (bserrno != 0) { + seq->bserrno = bserrno; + } + spdk_bs_request_set_complete((struct spdk_bs_request_set *)seq); +} + +void +spdk_bs_user_op_sequence_finish(void *cb_arg, int bserrno) +{ + spdk_bs_sequence_t *seq = cb_arg; + + spdk_bs_sequence_finish(seq, bserrno); +} + +static void +spdk_bs_batch_completion(struct spdk_io_channel *_channel, + void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = cb_arg; + + set->u.batch.outstanding_ops--; + if (bserrno != 0) { + set->bserrno = bserrno; + } + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = spdk_bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, bserrno); + } else { + spdk_bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +spdk_bs_batch_open(struct spdk_io_channel *_channel, + struct spdk_bs_cpl *cpl) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + + channel = spdk_io_channel_get_ctx(_channel); + + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->bserrno = 0; + set->channel = channel; + + set->u.batch.cb_fn = NULL; + set->u.batch.cb_arg = NULL; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = spdk_bs_batch_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = channel->dev_channel; + + return (spdk_bs_batch_t *)set; +} + +void +spdk_bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + bs_dev->read(bs_dev, spdk_io_channel_from_ctx(channel), payload, lba, lba_count, &set->cb_args); +} + +void +spdk_bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Reading %" PRIu32 " blocks from LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->read(channel->dev, channel->dev_channel, payload, lba, lba_count, &set->cb_args); +} + +void +spdk_bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Writing %" PRIu32 " blocks to LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write(channel->dev, channel->dev_channel, payload, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Unmapping %" PRIu32 
" blocks at LBA %" PRIu64 "\n", lba_count, + lba); + + set->u.batch.outstanding_ops++; + channel->dev->unmap(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + struct spdk_bs_channel *channel = set->channel; + + SPDK_DEBUGLOG(SPDK_LOG_BLOB_RW, "Zeroing %" PRIu32 " blocks at LBA %" PRIu64 "\n", lba_count, lba); + + set->u.batch.outstanding_ops++; + channel->dev->write_zeroes(channel->dev, channel->dev_channel, lba, lba_count, + &set->cb_args); +} + +void +spdk_bs_batch_close(spdk_bs_batch_t *batch) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + + set->u.batch.batch_closed = 1; + + if (set->u.batch.outstanding_ops == 0) { + if (set->u.batch.cb_fn) { + set->cb_args.cb_fn = spdk_bs_sequence_completion; + set->u.batch.cb_fn((spdk_bs_sequence_t *)set, set->u.batch.cb_arg, set->bserrno); + } else { + spdk_bs_request_set_complete(set); + } + } +} + +spdk_bs_batch_t * +spdk_bs_sequence_to_batch(spdk_bs_sequence_t *seq, spdk_bs_sequence_cpl cb_fn, void *cb_arg) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)seq; + + set->u.batch.cb_fn = cb_fn; + set->u.batch.cb_arg = cb_arg; + set->u.batch.outstanding_ops = 0; + set->u.batch.batch_closed = 0; + + set->cb_args.cb_fn = spdk_bs_batch_completion; + + return set; +} + +spdk_bs_sequence_t * +spdk_bs_batch_to_sequence(spdk_bs_batch_t *batch) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)batch; + + set->u.batch.outstanding_ops++; + + set->cpl.type = SPDK_BS_CPL_TYPE_BLOB_BASIC; + set->cpl.u.blob_basic.cb_fn = spdk_bs_sequence_to_batch_completion; + set->cpl.u.blob_basic.cb_arg = set; + set->bserrno = 0; + + set->cb_args.cb_fn = spdk_bs_sequence_completion; + set->cb_args.cb_arg = set; + set->cb_args.channel = set->channel->dev_channel; + + return (spdk_bs_sequence_t *)set; +} + +spdk_bs_user_op_t * +spdk_bs_user_op_alloc(struct spdk_io_channel *_channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length) +{ + struct spdk_bs_channel *channel; + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + + channel = spdk_io_channel_get_ctx(_channel); + + set = TAILQ_FIRST(&channel->reqs); + if (!set) { + return NULL; + } + TAILQ_REMOVE(&channel->reqs, set, link); + + set->cpl = *cpl; + set->channel = channel; + + args = &set->u.user_op; + + args->type = op_type; + args->iovcnt = iovcnt; + args->blob = blob; + args->offset = offset; + args->length = length; + args->payload = payload; + + return (spdk_bs_user_op_t *)set; +} + +void +spdk_bs_user_op_execute(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + struct spdk_bs_user_op_args *args; + struct spdk_io_channel *ch; + + set = (struct spdk_bs_request_set *)op; + args = &set->u.user_op; + ch = spdk_io_channel_from_ctx(set->channel); + + switch (args->type) { + case SPDK_BLOB_READ: + spdk_blob_io_read(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE: + spdk_blob_io_write(args->blob, ch, args->payload, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_UNMAP: + spdk_blob_io_unmap(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, 
set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITE_ZEROES: + spdk_blob_io_write_zeroes(args->blob, ch, args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_READV: + spdk_blob_io_readv(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + case SPDK_BLOB_WRITEV: + spdk_blob_io_writev(args->blob, ch, args->payload, args->iovcnt, + args->offset, args->length, + set->cpl.u.blob_basic.cb_fn, set->cpl.u.blob_basic.cb_arg); + break; + } + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +spdk_bs_user_op_abort(spdk_bs_user_op_t *op) +{ + struct spdk_bs_request_set *set; + + set = (struct spdk_bs_request_set *)op; + + set->cpl.u.blob_basic.cb_fn(set->cpl.u.blob_basic.cb_arg, -EIO); + TAILQ_INSERT_TAIL(&set->channel->reqs, set, link); +} + +void +spdk_bs_sequence_to_batch_completion(void *cb_arg, int bserrno) +{ + struct spdk_bs_request_set *set = (struct spdk_bs_request_set *)cb_arg; + + set->u.batch.outstanding_ops--; + + if (set->u.batch.outstanding_ops == 0 && set->u.batch.batch_closed) { + if (set->cb_args.cb_fn) { + set->cb_args.cb_fn(set->cb_args.channel, set->cb_args.cb_arg, bserrno); + } + } +} + +SPDK_LOG_REGISTER_COMPONENT("blob_rw", SPDK_LOG_BLOB_RW) diff --git a/src/spdk/lib/blob/request.h b/src/spdk/lib/blob/request.h new file mode 100644 index 00000000..4efb5cd5 --- /dev/null +++ b/src/spdk/lib/blob/request.h @@ -0,0 +1,223 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BS_REQUEST_H +#define SPDK_BS_REQUEST_H + +#include "spdk/stdinc.h" + +#include "spdk/blob.h" + +enum spdk_bs_cpl_type { + SPDK_BS_CPL_TYPE_NONE, + SPDK_BS_CPL_TYPE_BS_BASIC, + SPDK_BS_CPL_TYPE_BS_HANDLE, + SPDK_BS_CPL_TYPE_BLOB_BASIC, + SPDK_BS_CPL_TYPE_BLOBID, + SPDK_BS_CPL_TYPE_BLOB_HANDLE, + SPDK_BS_CPL_TYPE_NESTED_SEQUENCE, +}; + +enum spdk_blob_op_type; + +struct spdk_bs_request_set; + +/* Use a sequence to submit a set of requests serially */ +typedef struct spdk_bs_request_set spdk_bs_sequence_t; + +/* Use a batch to submit a set of requests in parallel */ +typedef struct spdk_bs_request_set spdk_bs_batch_t; + +/* Use a user_op to queue a user operation for later execution */ +typedef struct spdk_bs_request_set spdk_bs_user_op_t; + +typedef void (*spdk_bs_nested_seq_complete)(void *cb_arg, spdk_bs_sequence_t *parent, int bserrno); + +struct spdk_bs_cpl { + enum spdk_bs_cpl_type type; + union { + struct { + spdk_bs_op_complete cb_fn; + void *cb_arg; + } bs_basic; + + struct { + spdk_bs_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob_store *bs; + } bs_handle; + + struct { + spdk_blob_op_complete cb_fn; + void *cb_arg; + } blob_basic; + + struct { + spdk_blob_op_with_id_complete cb_fn; + void *cb_arg; + spdk_blob_id blobid; + } blobid; + + struct { + spdk_blob_op_with_handle_complete cb_fn; + void *cb_arg; + struct spdk_blob *blob; + } blob_handle; + + struct { + spdk_bs_nested_seq_complete cb_fn; + void *cb_arg; + spdk_bs_sequence_t *parent; + } nested_seq; + } u; +}; + +typedef void (*spdk_bs_sequence_cpl)(spdk_bs_sequence_t *sequence, + void *cb_arg, int bserrno); + +/* A generic request set. Can be a sequence, batch or a user_op. */ +struct spdk_bs_request_set { + struct spdk_bs_cpl cpl; + + int bserrno; + + struct spdk_bs_channel *channel; + + struct spdk_bs_dev_cb_args cb_args; + + union { + struct { + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } sequence; + + struct { + uint32_t outstanding_ops; + uint32_t batch_closed; + spdk_bs_sequence_cpl cb_fn; + void *cb_arg; + } batch; + + struct spdk_bs_user_op_args { + int type; + int iovcnt; + struct spdk_blob *blob; + uint64_t offset; + uint64_t length; + spdk_blob_op_complete cb_fn; + void *cb_arg; + void *payload; /* cast to iov for readv/writev */ + } user_op; + } u; + + TAILQ_ENTRY(spdk_bs_request_set) link; +}; + +void spdk_bs_call_cpl(struct spdk_bs_cpl *cpl, int bserrno); + +spdk_bs_sequence_t *spdk_bs_sequence_start(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void spdk_bs_sequence_read_bs_dev(spdk_bs_sequence_t *seq, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_read_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_write_dev(spdk_bs_sequence_t *seq, void *payload, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_readv_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + struct iovec *iov, int iovcnt, uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_readv_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_writev_dev(spdk_bs_batch_t *batch, struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, 
void *cb_arg); + +void spdk_bs_sequence_unmap_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_write_zeroes_dev(spdk_bs_sequence_t *seq, + uint64_t lba, uint32_t lba_count, + spdk_bs_sequence_cpl cb_fn, void *cb_arg); + +void spdk_bs_sequence_finish(spdk_bs_sequence_t *seq, int bserrno); + +void spdk_bs_user_op_sequence_finish(void *cb_arg, int bserrno); + +spdk_bs_batch_t *spdk_bs_batch_open(struct spdk_io_channel *channel, + struct spdk_bs_cpl *cpl); + +void spdk_bs_batch_read_bs_dev(spdk_bs_batch_t *batch, struct spdk_bs_dev *bs_dev, + void *payload, uint64_t lba, uint32_t lba_count); + +void spdk_bs_batch_read_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void spdk_bs_batch_write_dev(spdk_bs_batch_t *batch, void *payload, + uint64_t lba, uint32_t lba_count); + +void spdk_bs_batch_unmap_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void spdk_bs_batch_write_zeroes_dev(spdk_bs_batch_t *batch, + uint64_t lba, uint32_t lba_count); + +void spdk_bs_batch_close(spdk_bs_batch_t *batch); + +spdk_bs_batch_t *spdk_bs_sequence_to_batch(spdk_bs_sequence_t *seq, + spdk_bs_sequence_cpl cb_fn, + void *cb_arg); + +spdk_bs_sequence_t *spdk_bs_batch_to_sequence(spdk_bs_batch_t *batch); + +spdk_bs_user_op_t *spdk_bs_user_op_alloc(struct spdk_io_channel *channel, struct spdk_bs_cpl *cpl, + enum spdk_blob_op_type op_type, struct spdk_blob *blob, + void *payload, int iovcnt, uint64_t offset, uint64_t length); + +void spdk_bs_user_op_execute(spdk_bs_user_op_t *op); + +void spdk_bs_user_op_abort(spdk_bs_user_op_t *op); + +void spdk_bs_sequence_to_batch_completion(void *cb_arg, int bserrno); + +#endif diff --git a/src/spdk/lib/blob/zeroes.c b/src/spdk/lib/blob/zeroes.c new file mode 100644 index 00000000..5b482417 --- /dev/null +++ b/src/spdk/lib/blob/zeroes.c @@ -0,0 +1,122 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/blob.h" + +#include "blobstore.h" + +static void +zeroes_destroy(struct spdk_bs_dev *bs_dev) +{ + return; +} + +static void +zeroes_read(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + memset(payload, 0, dev->blocklen * lba_count); + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_write(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, void *payload, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_readv(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, struct spdk_bs_dev_cb_args *cb_args) +{ + int i; + + for (i = 0; i < iovcnt; i++) { + memset(iov[i].iov_base, 0, iov[i].iov_len); + } + + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + + + +static void +zeroes_writev(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + struct iovec *iov, int iovcnt, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, -EPERM); + assert(false); +} + +static void +zeroes_write_zeroes(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static void +zeroes_unmap(struct spdk_bs_dev *dev, struct spdk_io_channel *channel, + uint64_t lba, uint32_t lba_count, + struct spdk_bs_dev_cb_args *cb_args) +{ + cb_args->cb_fn(cb_args->channel, cb_args->cb_arg, 0); +} + +static struct spdk_bs_dev g_zeroes_bs_dev = { + .blockcnt = UINT64_MAX, + .blocklen = 512, + .create_channel = NULL, + .destroy_channel = NULL, + .destroy = zeroes_destroy, + .read = zeroes_read, + .write = zeroes_write, + .readv = zeroes_readv, + .writev = zeroes_writev, + .write_zeroes = zeroes_write_zeroes, + .unmap = zeroes_unmap, +}; + +struct spdk_bs_dev * +spdk_bs_create_zeroes_dev(void) +{ + return &g_zeroes_bs_dev; +} diff --git a/src/spdk/lib/blobfs/Makefile b/src/spdk/lib/blobfs/Makefile new file mode 100644 index 00000000..ea36b6ab --- /dev/null +++ b/src/spdk/lib/blobfs/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = blobfs.c tree.c +LIBNAME = blobfs + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/blobfs/blobfs.c b/src/spdk/lib/blobfs/blobfs.c new file mode 100644 index 00000000..48e9f481 --- /dev/null +++ b/src/spdk/lib/blobfs/blobfs.c @@ -0,0 +1,2617 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "spdk/conf.h" +#include "blobfs_internal.h" + +#include "spdk/queue.h" +#include "spdk/thread.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk/util.h" +#include "spdk_internal/log.h" + +#define BLOBFS_TRACE(file, str, args...) \ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s " str, file->name, ##args) + +#define BLOBFS_TRACE_RW(file, str, args...) 
\ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS_RW, "file=%s " str, file->name, ##args) + +#define BLOBFS_DEFAULT_CACHE_SIZE (4ULL * 1024 * 1024 * 1024) +#define SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ (1024 * 1024) + +static uint64_t g_fs_cache_size = BLOBFS_DEFAULT_CACHE_SIZE; +static struct spdk_mempool *g_cache_pool; +static TAILQ_HEAD(, spdk_file) g_caches; +static int g_fs_count = 0; +static pthread_mutex_t g_cache_init_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_spinlock_t g_caches_lock; + +void +spdk_cache_buffer_free(struct cache_buffer *cache_buffer) +{ + spdk_mempool_put(g_cache_pool, cache_buffer->buf); + free(cache_buffer); +} + +#define CACHE_READAHEAD_THRESHOLD (128 * 1024) + +struct spdk_file { + struct spdk_filesystem *fs; + struct spdk_blob *blob; + char *name; + uint64_t length; + bool is_deleted; + bool open_for_writing; + uint64_t length_flushed; + uint64_t append_pos; + uint64_t seq_byte_count; + uint64_t next_seq_offset; + uint32_t priority; + TAILQ_ENTRY(spdk_file) tailq; + spdk_blob_id blobid; + uint32_t ref_count; + pthread_spinlock_t lock; + struct cache_buffer *last; + struct cache_tree *tree; + TAILQ_HEAD(open_requests_head, spdk_fs_request) open_requests; + TAILQ_HEAD(sync_requests_head, spdk_fs_request) sync_requests; + TAILQ_ENTRY(spdk_file) cache_tailq; +}; + +struct spdk_deleted_file { + spdk_blob_id id; + TAILQ_ENTRY(spdk_deleted_file) tailq; +}; + +struct spdk_filesystem { + struct spdk_blob_store *bs; + TAILQ_HEAD(, spdk_file) files; + struct spdk_bs_opts bs_opts; + struct spdk_bs_dev *bdev; + fs_send_request_fn send_request; + + struct { + uint32_t max_ops; + struct spdk_io_channel *sync_io_channel; + struct spdk_fs_channel *sync_fs_channel; + } sync_target; + + struct { + uint32_t max_ops; + struct spdk_io_channel *md_io_channel; + struct spdk_fs_channel *md_fs_channel; + } md_target; + + struct { + uint32_t max_ops; + } io_target; +}; + +struct spdk_fs_cb_args { + union { + spdk_fs_op_with_handle_complete fs_op_with_handle; + spdk_fs_op_complete fs_op; + spdk_file_op_with_handle_complete file_op_with_handle; + spdk_file_op_complete file_op; + spdk_file_stat_op_complete stat_op; + } fn; + void *arg; + sem_t *sem; + struct spdk_filesystem *fs; + struct spdk_file *file; + int rc; + bool from_request; + union { + struct { + TAILQ_HEAD(, spdk_deleted_file) deleted_files; + } fs_load; + struct { + uint64_t length; + } truncate; + struct { + struct spdk_io_channel *channel; + void *user_buf; + void *pin_buf; + int is_read; + off_t offset; + size_t length; + uint64_t start_lba; + uint64_t num_lba; + uint32_t blocklen; + } rw; + struct { + const char *old_name; + const char *new_name; + } rename; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + } flush; + struct { + struct cache_buffer *cache_buffer; + uint64_t length; + uint64_t offset; + } readahead; + struct { + uint64_t offset; + TAILQ_ENTRY(spdk_fs_request) tailq; + bool xattr_in_progress; + } sync; + struct { + uint32_t num_clusters; + } resize; + struct { + const char *name; + uint32_t flags; + TAILQ_ENTRY(spdk_fs_request) tailq; + } open; + struct { + const char *name; + struct spdk_blob *blob; + } create; + struct { + const char *name; + } delete; + struct { + const char *name; + } stat; + } op; +}; + +static void cache_free_buffers(struct spdk_file *file); + +void +spdk_fs_opts_init(struct spdk_blobfs_opts *opts) +{ + opts->cluster_sz = SPDK_BLOBFS_DEFAULT_OPTS_CLUSTER_SZ; +} + +static void +__initialize_cache(void) +{ + assert(g_cache_pool == NULL); + + g_cache_pool = 
spdk_mempool_create("spdk_fs_cache", + g_fs_cache_size / CACHE_BUFFER_SIZE, + CACHE_BUFFER_SIZE, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!g_cache_pool) { + SPDK_ERRLOG("Create mempool failed, you may " + "increase the memory and try again\n"); + assert(false); + } + TAILQ_INIT(&g_caches); + pthread_spin_init(&g_caches_lock, 0); +} + +static void +__free_cache(void) +{ + assert(g_cache_pool != NULL); + + spdk_mempool_free(g_cache_pool); + g_cache_pool = NULL; +} + +static uint64_t +__file_get_blob_size(struct spdk_file *file) +{ + uint64_t cluster_sz; + + cluster_sz = file->fs->bs_opts.cluster_sz; + return cluster_sz * spdk_blob_get_num_clusters(file->blob); +} + +struct spdk_fs_request { + struct spdk_fs_cb_args args; + TAILQ_ENTRY(spdk_fs_request) link; + struct spdk_fs_channel *channel; +}; + +struct spdk_fs_channel { + struct spdk_fs_request *req_mem; + TAILQ_HEAD(, spdk_fs_request) reqs; + sem_t sem; + struct spdk_filesystem *fs; + struct spdk_io_channel *bs_channel; + fs_send_request_fn send_request; + bool sync; + pthread_spinlock_t lock; +}; + +static struct spdk_fs_request * +alloc_fs_request(struct spdk_fs_channel *channel) +{ + struct spdk_fs_request *req; + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + req = TAILQ_FIRST(&channel->reqs); + if (req) { + TAILQ_REMOVE(&channel->reqs, req, link); + } + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } + + if (req == NULL) { + return NULL; + } + memset(req, 0, sizeof(*req)); + req->channel = channel; + req->args.from_request = true; + + return req; +} + +static void +free_fs_request(struct spdk_fs_request *req) +{ + struct spdk_fs_channel *channel = req->channel; + + if (channel->sync) { + pthread_spin_lock(&channel->lock); + } + + TAILQ_INSERT_HEAD(&req->channel->reqs, req, link); + + if (channel->sync) { + pthread_spin_unlock(&channel->lock); + } +} + +static int +_spdk_fs_channel_create(struct spdk_filesystem *fs, struct spdk_fs_channel *channel, + uint32_t max_ops) +{ + uint32_t i; + + channel->req_mem = calloc(max_ops, sizeof(struct spdk_fs_request)); + if (!channel->req_mem) { + return -1; + } + + TAILQ_INIT(&channel->reqs); + sem_init(&channel->sem, 0, 0); + + for (i = 0; i < max_ops; i++) { + TAILQ_INSERT_TAIL(&channel->reqs, &channel->req_mem[i], link); + } + + channel->fs = fs; + + return 0; +} + +static int +_spdk_fs_md_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, md_target); + + return _spdk_fs_channel_create(fs, channel, fs->md_target.max_ops); +} + +static int +_spdk_fs_sync_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, sync_target); + + return _spdk_fs_channel_create(fs, channel, fs->sync_target.max_ops); +} + +static int +_spdk_fs_io_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_filesystem *fs; + struct spdk_fs_channel *channel = ctx_buf; + + fs = SPDK_CONTAINEROF(io_device, struct spdk_filesystem, io_target); + + return _spdk_fs_channel_create(fs, channel, fs->io_target.max_ops); +} + +static void +_spdk_fs_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_fs_channel *channel = ctx_buf; + + free(channel->req_mem); + if (channel->bs_channel != NULL) { + spdk_bs_free_io_channel(channel->bs_channel); + } +} + +static void 
+__send_request_direct(fs_request_fn fn, void *arg) +{ + fn(arg); +} + +static void +common_fs_bs_init(struct spdk_filesystem *fs, struct spdk_blob_store *bs) +{ + fs->bs = bs; + fs->bs_opts.cluster_sz = spdk_bs_get_cluster_size(bs); + fs->md_target.md_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs->md_target.md_fs_channel->send_request = __send_request_direct; + fs->sync_target.sync_fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs->sync_target.sync_fs_channel->send_request = __send_request_direct; + + pthread_mutex_lock(&g_cache_init_lock); + if (g_fs_count == 0) { + __initialize_cache(); + } + g_fs_count++; + pthread_mutex_unlock(&g_cache_init_lock); +} + +static void +init_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (bserrno == 0) { + common_fs_bs_init(fs, bs); + } else { + free(fs); + fs = NULL; + } + + args->fn.fs_op_with_handle(args->arg, fs, bserrno); + free_fs_request(req); +} + +static void +fs_conf_parse(void) +{ + struct spdk_conf_section *sp; + + sp = spdk_conf_find_section(NULL, "Blobfs"); + if (sp == NULL) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + return; + } + + g_fs_cache_buffer_shift = spdk_conf_section_get_intval(sp, "CacheBufferShift"); + if (g_fs_cache_buffer_shift <= 0) { + g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + } +} + +static struct spdk_filesystem * +fs_alloc(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn) +{ + struct spdk_filesystem *fs; + + fs = calloc(1, sizeof(*fs)); + if (fs == NULL) { + return NULL; + } + + fs->bdev = dev; + fs->send_request = send_request_fn; + TAILQ_INIT(&fs->files); + + fs->md_target.max_ops = 512; + spdk_io_device_register(&fs->md_target, _spdk_fs_md_channel_create, _spdk_fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_md"); + fs->md_target.md_io_channel = spdk_get_io_channel(&fs->md_target); + fs->md_target.md_fs_channel = spdk_io_channel_get_ctx(fs->md_target.md_io_channel); + + fs->sync_target.max_ops = 512; + spdk_io_device_register(&fs->sync_target, _spdk_fs_sync_channel_create, _spdk_fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_sync"); + fs->sync_target.sync_io_channel = spdk_get_io_channel(&fs->sync_target); + fs->sync_target.sync_fs_channel = spdk_io_channel_get_ctx(fs->sync_target.sync_io_channel); + + fs->io_target.max_ops = 512; + spdk_io_device_register(&fs->io_target, _spdk_fs_io_channel_create, _spdk_fs_channel_destroy, + sizeof(struct spdk_fs_channel), "blobfs_io"); + + return fs; +} + +static void +__wake_caller(void *arg, int fserrno) +{ + struct spdk_fs_cb_args *args = arg; + + args->rc = fserrno; + sem_post(args->sem); +} + +void +spdk_fs_init(struct spdk_bs_dev *dev, struct spdk_blobfs_opts *opt, + fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_bs_opts opts = {}; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + spdk_put_io_channel(fs->md_target.md_io_channel); + spdk_io_device_unregister(&fs->md_target, NULL); + spdk_put_io_channel(fs->sync_target.sync_io_channel); + spdk_io_device_unregister(&fs->sync_target, NULL); + spdk_io_device_unregister(&fs->io_target, 
NULL); + free(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + spdk_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "BLOBFS"); + if (opt) { + opts.cluster_sz = opt->cluster_sz; + } + spdk_bs_init(dev, &opts, init_cb, req); +} + +static struct spdk_file * +file_alloc(struct spdk_filesystem *fs) +{ + struct spdk_file *file; + + file = calloc(1, sizeof(*file)); + if (file == NULL) { + return NULL; + } + + file->tree = calloc(1, sizeof(*file->tree)); + if (file->tree == NULL) { + free(file); + return NULL; + } + + file->fs = fs; + TAILQ_INIT(&file->open_requests); + TAILQ_INIT(&file->sync_requests); + pthread_spin_init(&file->lock, 0); + TAILQ_INSERT_TAIL(&fs->files, file, tailq); + file->priority = SPDK_FILE_PRIORITY_LOW; + return file; +} + +static void fs_load_done(void *ctx, int bserrno); + +static int +_handle_deleted_files(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + if (!TAILQ_EMPTY(&args->op.fs_load.deleted_files)) { + struct spdk_deleted_file *deleted_file; + + deleted_file = TAILQ_FIRST(&args->op.fs_load.deleted_files); + TAILQ_REMOVE(&args->op.fs_load.deleted_files, deleted_file, tailq); + spdk_bs_delete_blob(fs->bs, deleted_file->id, fs_load_done, req); + free(deleted_file); + return 0; + } + + return 1; +} + +static void +fs_load_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + + /* The filesystem has been loaded. Now check if there are any files that + * were marked for deletion before last unload. Do not complete the + * fs_load callback until all of them have been deleted on disk. + */ + if (_handle_deleted_files(req) == 0) { + /* We found a file that's been marked for deleting but not actually + * deleted yet. This function will get called again once the delete + * operation is completed. 
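+ * (_handle_deleted_files() removes one entry from the deleted_files list and
+ * calls spdk_bs_delete_blob() with fs_load_done as its completion callback,
+ * so this path repeats until the list is empty.)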
+ */ + return; + } + + args->fn.fs_op_with_handle(args->arg, fs, 0); + free_fs_request(req); + +} + +static void +iter_cb(void *ctx, struct spdk_blob *blob, int rc) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + uint64_t *length; + const char *name; + uint32_t *is_deleted; + size_t value_len; + + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&name, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "length", (const void **)&length, &value_len); + if (rc < 0) { + args->fn.fs_op_with_handle(args->arg, fs, rc); + free_fs_request(req); + return; + } + + assert(value_len == 8); + + /* This file may have been marked for deletion but never closed (e.g. the app crashed before closing it), so finish deleting it now */ + rc = spdk_blob_get_xattr_value(blob, "is_deleted", (const void **)&is_deleted, &value_len); + if (rc < 0) { + struct spdk_file *f; + + f = file_alloc(fs); + if (f == NULL) { + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + + f->name = strdup(name); + f->blobid = spdk_blob_get_id(blob); + f->length = *length; + f->length_flushed = *length; + f->append_pos = *length; + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "added file %s length=%ju\n", f->name, f->length); + } else { + struct spdk_deleted_file *deleted_file; + + deleted_file = calloc(1, sizeof(*deleted_file)); + if (deleted_file == NULL) { + args->fn.fs_op_with_handle(args->arg, fs, -ENOMEM); + free_fs_request(req); + return; + } + deleted_file->id = spdk_blob_get_id(blob); + TAILQ_INSERT_TAIL(&args->op.fs_load.deleted_files, deleted_file, tailq); + } +} + +static void +load_cb(void *ctx, struct spdk_blob_store *bs, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_bs_type bstype; + static const struct spdk_bs_type blobfs_type = {"BLOBFS"}; + static const struct spdk_bs_type zeros; + + if (bserrno != 0) { + args->fn.fs_op_with_handle(args->arg, NULL, bserrno); + free_fs_request(req); + free(fs); + return; + } + + bstype = spdk_bs_get_bstype(bs); + + if (!memcmp(&bstype, &zeros, sizeof(bstype))) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "assigning bstype\n"); + spdk_bs_set_bstype(bs, blobfs_type); + } else if (memcmp(&bstype, &blobfs_type, sizeof(bstype))) { + SPDK_DEBUGLOG(SPDK_LOG_BLOB, "not blobfs\n"); + SPDK_TRACEDUMP(SPDK_LOG_BLOB, "bstype", &bstype, sizeof(bstype)); + args->fn.fs_op_with_handle(args->arg, NULL, bserrno); + free_fs_request(req); + free(fs); + return; + } + + common_fs_bs_init(fs, bs); + fs_load_done(req, 0); +} + +static void +spdk_fs_io_device_unregister(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_io_device_unregister(&fs->md_target, NULL); + spdk_io_device_unregister(&fs->sync_target, NULL); + spdk_io_device_unregister(&fs->io_target, NULL); + free(fs); +} + +static void +spdk_fs_free_io_channels(struct spdk_filesystem *fs) +{ + assert(fs != NULL); + spdk_fs_free_io_channel(fs->md_target.md_io_channel); + spdk_fs_free_io_channel(fs->sync_target.sync_io_channel); +} + +void +spdk_fs_load(struct spdk_bs_dev *dev, fs_send_request_fn send_request_fn, + spdk_fs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + struct spdk_fs_cb_args *args; + struct spdk_fs_request *req; +
struct spdk_bs_opts bs_opts; + + fs = fs_alloc(dev, send_request_fn); + if (fs == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + fs_conf_parse(); + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + spdk_fs_free_io_channels(fs); + spdk_fs_io_device_unregister(fs); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op_with_handle = cb_fn; + args->arg = cb_arg; + args->fs = fs; + TAILQ_INIT(&args->op.fs_load.deleted_files); + spdk_bs_opts_init(&bs_opts); + bs_opts.iter_cb_fn = iter_cb; + bs_opts.iter_cb_arg = req; + spdk_bs_load(dev, &bs_opts, load_cb, req); +} + +static void +unload_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_filesystem *fs = args->fs; + struct spdk_file *file, *tmp; + + TAILQ_FOREACH_SAFE(file, &fs->files, tailq, tmp) { + TAILQ_REMOVE(&fs->files, file, tailq); + cache_free_buffers(file); + free(file->name); + free(file->tree); + free(file); + } + + pthread_mutex_lock(&g_cache_init_lock); + g_fs_count--; + if (g_fs_count == 0) { + __free_cache(); + } + pthread_mutex_unlock(&g_cache_init_lock); + + args->fn.fs_op(args->arg, bserrno); + free(req); + + spdk_fs_io_device_unregister(fs); +} + +void +spdk_fs_unload(struct spdk_filesystem *fs, spdk_fs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + /* + * We must free the md_channel before unloading the blobstore, so just + * allocate this request from the general heap. + */ + req = calloc(1, sizeof(*req)); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->arg = cb_arg; + args->fs = fs; + + spdk_fs_free_io_channels(fs); + spdk_bs_unload(fs->bs, unload_cb, req); +} + +static struct spdk_file * +fs_find_file(struct spdk_filesystem *fs, const char *name) +{ + struct spdk_file *file; + + TAILQ_FOREACH(file, &fs->files, tailq) { + if (!strncmp(name, file->name, SPDK_FILE_NAME_MAX)) { + return file; + } + } + + return NULL; +} + +void +spdk_fs_file_stat_async(struct spdk_filesystem *fs, const char *name, + spdk_file_stat_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file_stat stat; + struct spdk_file *f = NULL; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f != NULL) { + stat.blobid = f->blobid; + stat.size = f->append_pos >= f->length ? 
f->append_pos : f->length; + cb_fn(cb_arg, &stat, 0); + return; + } + + cb_fn(cb_arg, NULL, -ENOENT); +} + +static void +__copy_stat(void *arg, struct spdk_file_stat *stat, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->rc = fserrno; + if (fserrno == 0) { + memcpy(args->arg, stat, sizeof(*stat)); + } + sem_post(args->sem); +} + +static void +__file_stat(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_file_stat_async(args->fs, args->op.stat.name, + args->fn.stat_op, req); +} + +int +spdk_fs_file_stat(struct spdk_filesystem *fs, struct spdk_io_channel *_channel, + const char *name, struct spdk_file_stat *stat) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + req->args.fs = fs; + req->args.op.stat.name = name; + req->args.fn.stat_op = __copy_stat; + req->args.arg = stat; + req->args.sem = &channel->sem; + channel->send_request(__file_stat, req); + sem_wait(&channel->sem); + + rc = req->args.rc; + free_fs_request(req); + + return rc; +} + +static void +fs_create_blob_close_cb(void *ctx, int bserrno) +{ + int rc; + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + rc = args->rc ? args->rc : bserrno; + args->fn.file_op(args->arg, rc); + free_fs_request(req); +} + +static void +fs_create_blob_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + struct spdk_blob *blob = args->op.create.blob; + uint64_t length = 0; + + args->rc = bserrno; + if (bserrno) { + spdk_blob_close(blob, fs_create_blob_close_cb, args); + return; + } + + spdk_blob_set_xattr(blob, "name", f->name, strlen(f->name) + 1); + spdk_blob_set_xattr(blob, "length", &length, sizeof(length)); + + spdk_blob_close(blob, fs_create_blob_close_cb, args); +} + +static void +fs_create_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + args->op.create.blob = blob; + spdk_blob_resize(blob, 1, fs_create_blob_resize_cb, req); +} + +static void +fs_create_blob_create_cb(void *ctx, spdk_blob_id blobid, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + f->blobid = blobid; + spdk_bs_open_blob(f->fs->bs, blobid, fs_create_blob_open_cb, req); +} + +void +spdk_fs_create_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *file; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + file = fs_find_file(fs, name); + if (file != NULL) { + cb_fn(cb_arg, -EEXIST); + return; + } + + file = file_alloc(fs); + if (file == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + 
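/* Keep an in-memory copy of the name; the blob's "name" xattr is written later, in fs_create_blob_resize_cb(). */ +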
file->name = strdup(name); + spdk_bs_create_blob(fs->bs, fs_create_blob_create_cb, args); +} + +static void +__fs_create_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->rc = fserrno; + sem_post(args->sem); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); +} + +static void +__fs_create_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.create.name); + spdk_fs_create_file_async(args->fs, args->op.create.name, __fs_create_file_done, req); +} + +int +spdk_fs_create_file(struct spdk_filesystem *fs, struct spdk_io_channel *_channel, const char *name) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.create.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_create_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +fs_open_blob_done(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f = args->file; + + f->blob = blob; + while (!TAILQ_EMPTY(&f->open_requests)) { + req = TAILQ_FIRST(&f->open_requests); + args = &req->args; + TAILQ_REMOVE(&f->open_requests, req, args.op.open.tailq); + args->fn.file_op_with_handle(args->arg, f, bserrno); + free_fs_request(req); + } +} + +static void +fs_open_blob_create_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + struct spdk_filesystem *fs = args->fs; + + if (file == NULL) { + /* + * This is from an open with CREATE flag - the file + * is now created so look it up in the file list for this + * filesystem. + */ + file = fs_find_file(fs, args->op.open.name); + assert(file != NULL); + args->file = file; + } + + file->ref_count++; + TAILQ_INSERT_TAIL(&file->open_requests, req, args.op.open.tailq); + if (file->ref_count == 1) { + assert(file->blob == NULL); + spdk_bs_open_blob(fs->bs, file->blobid, fs_open_blob_done, req); + } else if (file->blob != NULL) { + fs_open_blob_done(req, file->blob, 0); + } else { + /* + * The blob open for this file is in progress due to a previous + * open request. When that open completes, it will invoke the + * open callback for this request. 
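+ * (The request was queued on file->open_requests above; fs_open_blob_done()
+ * drains that list and completes every waiting open with the same blob.)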
+ */ + } +} + +void +spdk_fs_open_file_async(struct spdk_filesystem *fs, const char *name, uint32_t flags, + spdk_file_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f = NULL; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, NULL, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL && !(flags & SPDK_BLOBFS_OPEN_CREATE)) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + if (f != NULL && f->is_deleted == true) { + cb_fn(cb_arg, NULL, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op_with_handle = cb_fn; + args->arg = cb_arg; + args->file = f; + args->fs = fs; + args->op.open.name = name; + + if (f == NULL) { + spdk_fs_create_file_async(fs, name, fs_open_blob_create_cb, req); + } else { + fs_open_blob_create_cb(req, 0); + } +} + +static void +__fs_open_file_done(void *arg, struct spdk_file *file, int bserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + args->file = file; + __wake_caller(args, bserrno); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); +} + +static void +__fs_open_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", args->op.open.name); + spdk_fs_open_file_async(args->fs, args->op.open.name, args->op.open.flags, + __fs_open_file_done, req); +} + +int +spdk_fs_open_file(struct spdk_filesystem *fs, struct spdk_io_channel *_channel, + const char *name, uint32_t flags, struct spdk_file **file) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.open.name = name; + args->op.open.flags = flags; + args->sem = &channel->sem; + fs->send_request(__fs_open_file, req); + sem_wait(&channel->sem); + rc = args->rc; + if (rc == 0) { + *file = args->file; + } else { + *file = NULL; + } + free_fs_request(req); + + return rc; +} + +static void +fs_rename_blob_close_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.fs_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_rename_blob_open_cb(void *ctx, struct spdk_blob *blob, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + const char *new_name = args->op.rename.new_name; + + spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + spdk_blob_close(blob, fs_rename_blob_close_cb, req); +} + +static void +__spdk_fs_md_rename_file(struct spdk_fs_request *req) +{ + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *f; + + f = fs_find_file(args->fs, args->op.rename.old_name); + if (f == NULL) { + args->fn.fs_op(args->arg, -ENOENT); + free_fs_request(req); + return; + } + + free(f->name); + f->name = strdup(args->op.rename.new_name); + args->file = f; + spdk_bs_open_blob(args->fs->bs, f->blobid, fs_rename_blob_open_cb, req); +} + +static void +fs_rename_delete_done(void *arg, int fserrno) +{ + __spdk_fs_md_rename_file(arg); +} + +void +spdk_fs_rename_file_async(struct 
spdk_filesystem *fs, + const char *old_name, const char *new_name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "old=%s new=%s\n", old_name, new_name); + if (strnlen(new_name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.fs_op = cb_fn; + args->fs = fs; + args->arg = cb_arg; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + + f = fs_find_file(fs, new_name); + if (f == NULL) { + __spdk_fs_md_rename_file(req); + return; + } + + /* + * The rename overwrites an existing file. So delete the existing file, then + * do the actual rename. + */ + spdk_fs_delete_file_async(fs, new_name, fs_rename_delete_done, req); +} + +static void +__fs_rename_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); +} + +static void +__fs_rename_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_rename_file_async(args->fs, args->op.rename.old_name, args->op.rename.new_name, + __fs_rename_file_done, req); +} + +int +spdk_fs_rename_file(struct spdk_filesystem *fs, struct spdk_io_channel *_channel, + const char *old_name, const char *new_name) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + + args->fs = fs; + args->op.rename.old_name = old_name; + args->op.rename.new_name = new_name; + args->sem = &channel->sem; + fs->send_request(__fs_rename_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + return rc; +} + +static void +blob_delete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +void +spdk_fs_delete_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_file *f; + spdk_blob_id blobid; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s\n", name); + + if (strnlen(name, SPDK_FILE_NAME_MAX + 1) == SPDK_FILE_NAME_MAX + 1) { + cb_fn(cb_arg, -ENAMETOOLONG); + return; + } + + f = fs_find_file(fs, name); + if (f == NULL) { + cb_fn(cb_arg, -ENOENT); + return; + } + + req = alloc_fs_request(fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + if (f->ref_count > 0) { + /* If the ref > 0, we mark the file as deleted and delete it when we close it. 
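The "is_deleted" xattr is synced to the blob below so that, if the application exits before the file is closed, iter_cb() will find the flag on the next spdk_fs_load() and the blob will be deleted during load.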
*/ + f->is_deleted = true; + spdk_blob_set_xattr(f->blob, "is_deleted", &f->is_deleted, sizeof(bool)); + spdk_blob_sync_md(f->blob, blob_delete_cb, args); + return; + } + + TAILQ_REMOVE(&fs->files, f, tailq); + + cache_free_buffers(f); + + blobid = f->blobid; + + free(f->name); + free(f->tree); + free(f); + + spdk_bs_delete_blob(fs->bs, blobid, blob_delete_cb, req); +} + +static void +__fs_delete_file_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __wake_caller(args, fserrno); +} + +static void +__fs_delete_file(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_fs_delete_file_async(args->fs, args->op.delete.name, __fs_delete_file_done, req); +} + +int +spdk_fs_delete_file(struct spdk_filesystem *fs, struct spdk_io_channel *_channel, + const char *name) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + args->fs = fs; + args->op.delete.name = name; + args->sem = &channel->sem; + fs->send_request(__fs_delete_file, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +spdk_fs_iter +spdk_fs_iter_first(struct spdk_filesystem *fs) +{ + struct spdk_file *f; + + f = TAILQ_FIRST(&fs->files); + return f; +} + +spdk_fs_iter +spdk_fs_iter_next(spdk_fs_iter iter) +{ + struct spdk_file *f = iter; + + if (f == NULL) { + return NULL; + } + + f = TAILQ_NEXT(f, tailq); + return f; +} + +const char * +spdk_file_get_name(struct spdk_file *file) +{ + return file->name; +} + +uint64_t +spdk_file_get_length(struct spdk_file *file) +{ + assert(file != NULL); + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s length=0x%jx\n", file->name, file->length); + return file->length; +} + +static void +fs_truncate_complete_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +fs_truncate_resize_cb(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + uint64_t *length = &args->op.truncate.length; + + if (bserrno) { + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); + return; + } + + spdk_blob_set_xattr(file->blob, "length", length, sizeof(*length)); + + file->length = *length; + if (file->append_pos > file->length) { + file->append_pos = file->length; + } + + spdk_blob_sync_md(file->blob, fs_truncate_complete_cb, args); +} + +static uint64_t +__bytes_to_clusters(uint64_t length, uint64_t cluster_sz) +{ + return (length + cluster_sz - 1) / cluster_sz; +} + +void +spdk_file_truncate_async(struct spdk_file *file, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_filesystem *fs; + size_t num_clusters; + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s old=0x%jx new=0x%jx\n", file->name, file->length, length); + if (length == file->length) { + cb_fn(cb_arg, 0); + return; + } + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.truncate.length = length; + fs = file->fs; + + 
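/* Round the new length up to whole clusters before resizing the blob; fs_truncate_resize_cb() then updates the "length" xattr and the in-memory file length. */ +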
num_clusters = __bytes_to_clusters(length, fs->bs_opts.cluster_sz); + + spdk_blob_resize(file->blob, num_clusters, fs_truncate_resize_cb, req); +} + +static void +__truncate(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + spdk_file_truncate_async(args->file, args->op.truncate.length, + args->fn.file_op, args); +} + +int +spdk_file_truncate(struct spdk_file *file, struct spdk_io_channel *_channel, + uint64_t length) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + int rc; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + + args->file = file; + args->op.truncate.length = length; + args->fn.file_op = __wake_caller; + args->sem = &channel->sem; + + channel->send_request(__truncate, req); + sem_wait(&channel->sem); + rc = args->rc; + free_fs_request(req); + + return rc; +} + +static void +__rw_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + spdk_dma_free(args->op.rw.pin_buf); + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +__read_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + assert(req != NULL); + if (args->op.rw.is_read) { + memcpy(args->op.rw.user_buf, + args->op.rw.pin_buf + (args->op.rw.offset & (args->op.rw.blocklen - 1)), + args->op.rw.length); + __rw_done(req, 0); + } else { + memcpy(args->op.rw.pin_buf + (args->op.rw.offset & (args->op.rw.blocklen - 1)), + args->op.rw.user_buf, + args->op.rw.length); + spdk_blob_io_write(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __rw_done, req); + } +} + +static void +__do_blob_read(void *ctx, int fserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + + if (fserrno) { + __rw_done(req, fserrno); + return; + } + spdk_blob_io_read(args->file->blob, args->op.rw.channel, + args->op.rw.pin_buf, + args->op.rw.start_lba, args->op.rw.num_lba, + __read_done, req); +} + +static void +__get_page_parameters(struct spdk_file *file, uint64_t offset, uint64_t length, + uint64_t *start_lba, uint32_t *lba_size, uint64_t *num_lba) +{ + uint64_t end_lba; + + *lba_size = spdk_bs_get_io_unit_size(file->fs->bs); + *start_lba = offset / *lba_size; + end_lba = (offset + length - 1) / *lba_size; + *num_lba = (end_lba - *start_lba + 1); +} + +static void +__readwrite(struct spdk_file *file, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg, int is_read) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + uint64_t start_lba, num_lba, pin_buf_length; + uint32_t lba_size; + + if (is_read && offset + length > file->length) { + cb_fn(cb_arg, -EINVAL); + return; + } + + req = alloc_fs_request(channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + args = &req->args; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + args->file = file; + args->op.rw.channel = channel->bs_channel; + args->op.rw.user_buf = payload; + args->op.rw.is_read = is_read; + args->op.rw.offset = offset; + args->op.rw.length = length; + args->op.rw.blocklen = lba_size; + + 
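/* Allocate a block-aligned bounce buffer covering whole LBAs; __read_done() copies between it and the user buffer at the intra-block offset. */ +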
pin_buf_length = num_lba * lba_size; + args->op.rw.pin_buf = spdk_dma_malloc(pin_buf_length, lba_size, NULL); + if (args->op.rw.pin_buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "Failed to allocate buf for: file=%s offset=%jx length=%jx\n", + file->name, offset, length); + free_fs_request(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + args->op.rw.start_lba = start_lba; + args->op.rw.num_lba = num_lba; + + if (!is_read && file->length < offset + length) { + spdk_file_truncate_async(file, offset + length, __do_blob_read, req); + } else { + __do_blob_read(req, 0); + } +} + +void +spdk_file_write_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 0); +} + +void +spdk_file_read_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "file=%s offset=%jx length=%jx\n", + file->name, offset, length); + __readwrite(file, channel, payload, offset, length, cb_fn, cb_arg, 1); +} + +struct spdk_io_channel * +spdk_fs_alloc_io_channel(struct spdk_filesystem *fs) +{ + struct spdk_io_channel *io_channel; + struct spdk_fs_channel *fs_channel; + + io_channel = spdk_get_io_channel(&fs->io_target); + fs_channel = spdk_io_channel_get_ctx(io_channel); + fs_channel->bs_channel = spdk_bs_alloc_io_channel(fs->bs); + fs_channel->send_request = __send_request_direct; + + return io_channel; +} + +struct spdk_io_channel * +spdk_fs_alloc_io_channel_sync(struct spdk_filesystem *fs) +{ + struct spdk_io_channel *io_channel; + struct spdk_fs_channel *fs_channel; + + io_channel = spdk_get_io_channel(&fs->io_target); + fs_channel = spdk_io_channel_get_ctx(io_channel); + fs_channel->send_request = fs->send_request; + fs_channel->sync = 1; + pthread_spin_init(&fs_channel->lock, 0); + + return io_channel; +} + +void +spdk_fs_free_io_channel(struct spdk_io_channel *channel) +{ + spdk_put_io_channel(channel); +} + +void +spdk_fs_set_cache_size(uint64_t size_in_mb) +{ + g_fs_cache_size = size_in_mb * 1024 * 1024; +} + +uint64_t +spdk_fs_get_cache_size(void) +{ + return g_fs_cache_size / (1024 * 1024); +} + +static void __file_flush(void *_args); + +static void * +alloc_cache_memory_buffer(struct spdk_file *context) +{ + struct spdk_file *file; + void *buf; + + buf = spdk_mempool_get(g_cache_pool); + if (buf != NULL) { + return buf; + } + + pthread_spin_lock(&g_caches_lock); + TAILQ_FOREACH(file, &g_caches, cache_tailq) { + if (!file->open_for_writing && + file->priority == SPDK_FILE_PRIORITY_LOW && + file != context) { + break; + } + } + pthread_spin_unlock(&g_caches_lock); + if (file != NULL) { + cache_free_buffers(file); + buf = spdk_mempool_get(g_cache_pool); + if (buf != NULL) { + return buf; + } + } + + pthread_spin_lock(&g_caches_lock); + TAILQ_FOREACH(file, &g_caches, cache_tailq) { + if (!file->open_for_writing && file != context) { + break; + } + } + pthread_spin_unlock(&g_caches_lock); + if (file != NULL) { + cache_free_buffers(file); + buf = spdk_mempool_get(g_cache_pool); + if (buf != NULL) { + return buf; + } + } + + pthread_spin_lock(&g_caches_lock); + TAILQ_FOREACH(file, &g_caches, cache_tailq) { + if (file != context) { + break; + } + } + pthread_spin_unlock(&g_caches_lock); + if (file != NULL) { + cache_free_buffers(file); + buf = spdk_mempool_get(g_cache_pool); + if (buf != NULL) { + return buf; 
+ } + } + + return NULL; +} + +static struct cache_buffer * +cache_insert_buffer(struct spdk_file *file, uint64_t offset) +{ + struct cache_buffer *buf; + int count = 0; + + buf = calloc(1, sizeof(*buf)); + if (buf == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "calloc failed\n"); + return NULL; + } + + buf->buf = alloc_cache_memory_buffer(file); + while (buf->buf == NULL) { + /* + * TODO: alloc_cache_memory_buffer() should eventually free + * some buffers. Need a more sophisticated check here, instead + * of just bailing if 100 tries does not result in getting a + * free buffer. This will involve using the sync channel's + * semaphore to block until a buffer becomes available. + */ + if (count++ == 100) { + SPDK_ERRLOG("could not allocate cache buffer\n"); + assert(false); + free(buf); + return NULL; + } + buf->buf = alloc_cache_memory_buffer(file); + } + + buf->buf_size = CACHE_BUFFER_SIZE; + buf->offset = offset; + + pthread_spin_lock(&g_caches_lock); + if (file->tree->present_mask == 0) { + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); + } + file->tree = spdk_tree_insert_buffer(file->tree, buf); + pthread_spin_unlock(&g_caches_lock); + + return buf; +} + +static struct cache_buffer * +cache_append_buffer(struct spdk_file *file) +{ + struct cache_buffer *last; + + assert(file->last == NULL || file->last->bytes_filled == file->last->buf_size); + assert((file->append_pos % CACHE_BUFFER_SIZE) == 0); + + last = cache_insert_buffer(file, file->append_pos); + if (last == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_BLOBFS, "cache_insert_buffer failed\n"); + return NULL; + } + + file->last = last; + + return last; +} + +static void __check_sync_reqs(struct spdk_file *file); + +static void +__file_cache_finish_sync(void *ctx, int bserrno) +{ + struct spdk_file *file = ctx; + struct spdk_fs_request *sync_req; + struct spdk_fs_cb_args *sync_args; + + pthread_spin_lock(&file->lock); + sync_req = TAILQ_FIRST(&file->sync_requests); + sync_args = &sync_req->args; + assert(sync_args->op.sync.offset <= file->length_flushed); + BLOBFS_TRACE(file, "sync done offset=%jx\n", sync_args->op.sync.offset); + TAILQ_REMOVE(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + sync_args->fn.file_op(sync_args->arg, bserrno); + __check_sync_reqs(file); + + pthread_spin_lock(&file->lock); + free_fs_request(sync_req); + pthread_spin_unlock(&file->lock); +} + +static void +__free_args(struct spdk_fs_cb_args *args) +{ + struct spdk_fs_request *req; + + if (!args->from_request) { + free(args); + } else { + /* Depends on args being at the start of the spdk_fs_request structure. 
*/ + req = (struct spdk_fs_request *)args; + free_fs_request(req); + } +} + +static void +__check_sync_reqs(struct spdk_file *file) +{ + struct spdk_fs_request *sync_req; + + pthread_spin_lock(&file->lock); + + TAILQ_FOREACH(sync_req, &file->sync_requests, args.op.sync.tailq) { + if (sync_req->args.op.sync.offset <= file->length_flushed) { + break; + } + } + + if (sync_req != NULL && !sync_req->args.op.sync.xattr_in_progress) { + BLOBFS_TRACE(file, "set xattr length 0x%jx\n", file->length_flushed); + sync_req->args.op.sync.xattr_in_progress = true; + spdk_blob_set_xattr(file->blob, "length", &file->length_flushed, + sizeof(file->length_flushed)); + + pthread_spin_unlock(&file->lock); + spdk_blob_sync_md(file->blob, __file_cache_finish_sync, file); + } else { + pthread_spin_unlock(&file->lock); + } +} + +static void +__file_flush_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + struct spdk_file *file = args->file; + struct cache_buffer *next = args->op.flush.cache_buffer; + + BLOBFS_TRACE(file, "length=%jx\n", args->op.flush.length); + + pthread_spin_lock(&file->lock); + next->in_progress = false; + next->bytes_flushed += args->op.flush.length; + file->length_flushed += args->op.flush.length; + if (file->length_flushed > file->length) { + file->length = file->length_flushed; + } + if (next->bytes_flushed == next->buf_size) { + BLOBFS_TRACE(file, "write buffer fully flushed 0x%jx\n", file->length_flushed); + next = spdk_tree_find_buffer(file->tree, file->length_flushed); + } + + /* + * Assert that there is no cached data that extends past the end of the underlying + * blob. + */ + assert(next == NULL || next->offset < __file_get_blob_size(file) || + next->bytes_filled == 0); + + pthread_spin_unlock(&file->lock); + + __check_sync_reqs(file); + + __file_flush(args); +} + +static void +__file_flush(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + struct cache_buffer *next; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + pthread_spin_lock(&file->lock); + next = spdk_tree_find_buffer(file->tree, file->length_flushed); + if (next == NULL || next->in_progress) { + /* + * There is either no data to flush, or a flush I/O is already in + * progress. So return immediately - if a flush I/O is in + * progress we will flush more data after that is completed. + */ + __free_args(args); + if (next == NULL) { + /* + * For cases where a file's cache was evicted, and then the + * file was later appended, we will write the data directly + * to disk and bypass cache. So just update length_flushed + * here to reflect that all data was already written to disk. + */ + file->length_flushed = file->append_pos; + } + pthread_spin_unlock(&file->lock); + if (next == NULL) { + /* + * There is no data to flush, but we still need to check for any + * outstanding sync requests to make sure metadata gets updated. 
+ */ + __check_sync_reqs(file); + } + return; + } + + offset = next->offset + next->bytes_flushed; + length = next->bytes_filled - next->bytes_flushed; + if (length == 0) { + __free_args(args); + pthread_spin_unlock(&file->lock); + return; + } + args->op.flush.length = length; + args->op.flush.cache_buffer = next; + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + next->in_progress = true; + BLOBFS_TRACE(file, "offset=%jx length=%jx page start=%jx num=%jx\n", + offset, length, start_lba, num_lba); + pthread_spin_unlock(&file->lock); + spdk_blob_io_write(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + next->buf + (start_lba * lba_size) - next->offset, + start_lba, num_lba, __file_flush_done, args); +} + +static void +__file_extend_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + + __wake_caller(args, bserrno); +} + +static void +__file_extend_resize_cb(void *_args, int bserrno) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + if (bserrno) { + __wake_caller(args, bserrno); + return; + } + + spdk_blob_sync_md(file->blob, __file_extend_done, args); +} + +static void +__file_extend_blob(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + spdk_blob_resize(file->blob, args->op.resize.num_clusters, __file_extend_resize_cb, args); +} + +static void +__rw_from_file_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + + __wake_caller(args, bserrno); + __free_args(args); +} + +static void +__rw_from_file(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + + if (args->op.rw.is_read) { + spdk_file_read_async(file, file->fs->sync_target.sync_io_channel, args->op.rw.user_buf, + args->op.rw.offset, args->op.rw.length, + __rw_from_file_done, args); + } else { + spdk_file_write_async(file, file->fs->sync_target.sync_io_channel, args->op.rw.user_buf, + args->op.rw.offset, args->op.rw.length, + __rw_from_file_done, args); + } +} + +static int +__send_rw_from_file(struct spdk_file *file, sem_t *sem, void *payload, + uint64_t offset, uint64_t length, bool is_read) +{ + struct spdk_fs_cb_args *args; + + args = calloc(1, sizeof(*args)); + if (args == NULL) { + sem_post(sem); + return -ENOMEM; + } + + args->file = file; + args->sem = sem; + args->op.rw.user_buf = payload; + args->op.rw.offset = offset; + args->op.rw.length = length; + args->op.rw.is_read = is_read; + file->fs->send_request(__rw_from_file, args); + return 0; +} + +int +spdk_file_write(struct spdk_file *file, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_cb_args *args; + uint64_t rem_length, copy, blob_size, cluster_sz; + uint32_t cache_buffers_filled = 0; + uint8_t *cur_payload; + struct cache_buffer *last; + + BLOBFS_TRACE_RW(file, "offset=%jx length=%jx\n", offset, length); + + if (length == 0) { + return 0; + } + + if (offset != file->append_pos) { + BLOBFS_TRACE(file, " error offset=%jx append_pos=%jx\n", offset, file->append_pos); + return -EINVAL; + } + + pthread_spin_lock(&file->lock); + file->open_for_writing = true; + + if (file->last == NULL) { + if (file->append_pos % CACHE_BUFFER_SIZE == 0) { + cache_append_buffer(file); + } else { + int rc; + + file->append_pos += length; + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, &channel->sem, payload, + offset, length, false); + 
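+ /*
+ * Cache-bypass path: the file has no active cache buffer and the
+ * append position is not cache-buffer aligned, so the data was
+ * handed directly to the sync I/O channel above. Block on the
+ * channel semaphore until that request completes.
+ */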
sem_wait(&channel->sem); + return rc; + } + } + + blob_size = __file_get_blob_size(file); + + if ((offset + length) > blob_size) { + struct spdk_fs_cb_args extend_args = {}; + + cluster_sz = file->fs->bs_opts.cluster_sz; + extend_args.sem = &channel->sem; + extend_args.op.resize.num_clusters = __bytes_to_clusters((offset + length), cluster_sz); + extend_args.file = file; + BLOBFS_TRACE(file, "start resize to %u clusters\n", extend_args.op.resize.num_clusters); + pthread_spin_unlock(&file->lock); + file->fs->send_request(__file_extend_blob, &extend_args); + sem_wait(&channel->sem); + if (extend_args.rc) { + return extend_args.rc; + } + } + + last = file->last; + rem_length = length; + cur_payload = payload; + while (rem_length > 0) { + copy = last->buf_size - last->bytes_filled; + if (copy > rem_length) { + copy = rem_length; + } + BLOBFS_TRACE_RW(file, " fill offset=%jx length=%jx\n", file->append_pos, copy); + memcpy(&last->buf[last->bytes_filled], cur_payload, copy); + file->append_pos += copy; + if (file->length < file->append_pos) { + file->length = file->append_pos; + } + cur_payload += copy; + last->bytes_filled += copy; + rem_length -= copy; + if (last->bytes_filled == last->buf_size) { + cache_buffers_filled++; + last = cache_append_buffer(file); + if (last == NULL) { + BLOBFS_TRACE(file, "nomem\n"); + pthread_spin_unlock(&file->lock); + return -ENOMEM; + } + } + } + + pthread_spin_unlock(&file->lock); + + if (cache_buffers_filled == 0) { + return 0; + } + + args = calloc(1, sizeof(*args)); + if (args == NULL) { + return -ENOMEM; + } + + args->file = file; + file->fs->send_request(__file_flush, args); + return 0; +} + +static void +__readahead_done(void *arg, int bserrno) +{ + struct spdk_fs_cb_args *args = arg; + struct cache_buffer *cache_buffer = args->op.readahead.cache_buffer; + struct spdk_file *file = args->file; + + BLOBFS_TRACE(file, "offset=%jx\n", cache_buffer->offset); + + pthread_spin_lock(&file->lock); + cache_buffer->bytes_filled = args->op.readahead.length; + cache_buffer->bytes_flushed = args->op.readahead.length; + cache_buffer->in_progress = false; + pthread_spin_unlock(&file->lock); + + __free_args(args); +} + +static void +__readahead(void *_args) +{ + struct spdk_fs_cb_args *args = _args; + struct spdk_file *file = args->file; + uint64_t offset, length, start_lba, num_lba; + uint32_t lba_size; + + offset = args->op.readahead.offset; + length = args->op.readahead.length; + assert(length > 0); + + __get_page_parameters(file, offset, length, &start_lba, &lba_size, &num_lba); + + BLOBFS_TRACE(file, "offset=%jx length=%jx page start=%jx num=%jx\n", + offset, length, start_lba, num_lba); + spdk_blob_io_read(file->blob, file->fs->sync_target.sync_fs_channel->bs_channel, + args->op.readahead.cache_buffer->buf, + start_lba, num_lba, __readahead_done, args); +} + +static uint64_t +__next_cache_buffer_offset(uint64_t offset) +{ + return (offset + CACHE_BUFFER_SIZE) & ~(CACHE_TREE_LEVEL_MASK(0)); +} + +static void +check_readahead(struct spdk_file *file, uint64_t offset) +{ + struct spdk_fs_cb_args *args; + + offset = __next_cache_buffer_offset(offset); + if (spdk_tree_find_buffer(file->tree, offset) != NULL || file->length <= offset) { + return; + } + + args = calloc(1, sizeof(*args)); + if (args == NULL) { + return; + } + + BLOBFS_TRACE(file, "offset=%jx\n", offset); + + args->file = file; + args->op.readahead.offset = offset; + args->op.readahead.cache_buffer = cache_insert_buffer(file, offset); + if (!args->op.readahead.cache_buffer) { + BLOBFS_TRACE(file, "Cannot 
allocate buf for offset=%jx\n", offset); + free(args); + return; + } + + args->op.readahead.cache_buffer->in_progress = true; + if (file->length < (offset + CACHE_BUFFER_SIZE)) { + args->op.readahead.length = file->length & (CACHE_BUFFER_SIZE - 1); + } else { + args->op.readahead.length = CACHE_BUFFER_SIZE; + } + file->fs->send_request(__readahead, args); +} + +static int +__file_read(struct spdk_file *file, void *payload, uint64_t offset, uint64_t length, sem_t *sem) +{ + struct cache_buffer *buf; + int rc; + + buf = spdk_tree_find_filled_buffer(file->tree, offset); + if (buf == NULL) { + pthread_spin_unlock(&file->lock); + rc = __send_rw_from_file(file, sem, payload, offset, length, true); + pthread_spin_lock(&file->lock); + return rc; + } + + if ((offset + length) > (buf->offset + buf->bytes_filled)) { + length = buf->offset + buf->bytes_filled - offset; + } + BLOBFS_TRACE(file, "read %p offset=%ju length=%ju\n", payload, offset, length); + memcpy(payload, &buf->buf[offset - buf->offset], length); + if ((offset + length) % CACHE_BUFFER_SIZE == 0) { + pthread_spin_lock(&g_caches_lock); + spdk_tree_remove_buffer(file->tree, buf); + if (file->tree->present_mask == 0) { + TAILQ_REMOVE(&g_caches, file, cache_tailq); + } + pthread_spin_unlock(&g_caches_lock); + } + + sem_post(sem); + return 0; +} + +int64_t +spdk_file_read(struct spdk_file *file, struct spdk_io_channel *_channel, + void *payload, uint64_t offset, uint64_t length) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + uint64_t final_offset, final_length; + uint32_t sub_reads = 0; + int rc = 0; + + pthread_spin_lock(&file->lock); + + BLOBFS_TRACE_RW(file, "offset=%ju length=%ju\n", offset, length); + + file->open_for_writing = false; + + if (length == 0 || offset >= file->append_pos) { + pthread_spin_unlock(&file->lock); + return 0; + } + + if (offset + length > file->append_pos) { + length = file->append_pos - offset; + } + + if (offset != file->next_seq_offset) { + file->seq_byte_count = 0; + } + file->seq_byte_count += length; + file->next_seq_offset = offset + length; + if (file->seq_byte_count >= CACHE_READAHEAD_THRESHOLD) { + check_readahead(file, offset); + check_readahead(file, offset + CACHE_BUFFER_SIZE); + } + + final_length = 0; + final_offset = offset + length; + while (offset < final_offset) { + length = NEXT_CACHE_BUFFER_OFFSET(offset) - offset; + if (length > (final_offset - offset)) { + length = final_offset - offset; + } + rc = __file_read(file, payload, offset, length, &channel->sem); + if (rc == 0) { + final_length += length; + } else { + break; + } + payload += length; + offset += length; + sub_reads++; + } + pthread_spin_unlock(&file->lock); + while (sub_reads-- > 0) { + sem_wait(&channel->sem); + } + if (rc == 0) { + return final_length; + } else { + return rc; + } +} + +static void +_file_sync(struct spdk_file *file, struct spdk_fs_channel *channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *sync_req; + struct spdk_fs_request *flush_req; + struct spdk_fs_cb_args *sync_args; + struct spdk_fs_cb_args *flush_args; + + BLOBFS_TRACE(file, "offset=%jx\n", file->append_pos); + + pthread_spin_lock(&file->lock); + if (file->append_pos <= file->length_flushed) { + BLOBFS_TRACE(file, "done - no data to flush\n"); + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, 0); + return; + } + + sync_req = alloc_fs_request(channel); + if (!sync_req) { + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + sync_args = &sync_req->args; + + flush_req = 
alloc_fs_request(channel); + if (!flush_req) { + pthread_spin_unlock(&file->lock); + cb_fn(cb_arg, -ENOMEM); + return; + } + flush_args = &flush_req->args; + + sync_args->file = file; + sync_args->fn.file_op = cb_fn; + sync_args->arg = cb_arg; + sync_args->op.sync.offset = file->append_pos; + sync_args->op.sync.xattr_in_progress = false; + TAILQ_INSERT_TAIL(&file->sync_requests, sync_req, args.op.sync.tailq); + pthread_spin_unlock(&file->lock); + + flush_args->file = file; + channel->send_request(__file_flush, flush_args); +} + +int +spdk_file_sync(struct spdk_file *file, struct spdk_io_channel *_channel) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_cb_args args = {}; + + args.sem = &channel->sem; + _file_sync(file, channel, __wake_caller, &args); + sem_wait(&channel->sem); + + return args.rc; +} + +void +spdk_file_sync_async(struct spdk_file *file, struct spdk_io_channel *_channel, + spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + + _file_sync(file, channel, cb_fn, cb_arg); +} + +void +spdk_file_set_priority(struct spdk_file *file, uint32_t priority) +{ + BLOBFS_TRACE(file, "priority=%u\n", priority); + file->priority = priority; + +} + +/* + * Close routines + */ + +static void +__file_close_async_done(void *ctx, int bserrno) +{ + struct spdk_fs_request *req = ctx; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + if (file->is_deleted) { + spdk_fs_delete_file_async(file->fs, file->name, blob_delete_cb, ctx); + return; + } + + args->fn.file_op(args->arg, bserrno); + free_fs_request(req); +} + +static void +__file_close_async(struct spdk_file *file, struct spdk_fs_request *req) +{ + struct spdk_blob *blob; + + pthread_spin_lock(&file->lock); + if (file->ref_count == 0) { + pthread_spin_unlock(&file->lock); + __file_close_async_done(req, -EBADF); + return; + } + + file->ref_count--; + if (file->ref_count > 0) { + pthread_spin_unlock(&file->lock); + req->args.fn.file_op(req->args.arg, 0); + free_fs_request(req); + return; + } + + pthread_spin_unlock(&file->lock); + + blob = file->blob; + file->blob = NULL; + spdk_blob_close(blob, __file_close_async_done, req); +} + +static void +__file_close_async__sync_done(void *arg, int fserrno) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + + __file_close_async(args->file, req); +} + +void +spdk_file_close_async(struct spdk_file *file, spdk_file_op_complete cb_fn, void *cb_arg) +{ + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(file->fs->md_target.md_fs_channel); + if (req == NULL) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + args = &req->args; + args->file = file; + args->fn.file_op = cb_fn; + args->arg = cb_arg; + + spdk_file_sync_async(file, file->fs->md_target.md_io_channel, __file_close_async__sync_done, req); +} + +static void +__file_close(void *arg) +{ + struct spdk_fs_request *req = arg; + struct spdk_fs_cb_args *args = &req->args; + struct spdk_file *file = args->file; + + __file_close_async(file, req); +} + +int +spdk_file_close(struct spdk_file *file, struct spdk_io_channel *_channel) +{ + struct spdk_fs_channel *channel = spdk_io_channel_get_ctx(_channel); + struct spdk_fs_request *req; + struct spdk_fs_cb_args *args; + + req = alloc_fs_request(channel); + if (req == NULL) { + return -ENOMEM; + } + + args = &req->args; + + spdk_file_sync(file, _channel); + BLOBFS_TRACE(file, "name=%s\n", file->name); + 
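+ /*
+ * Dirty data was flushed by spdk_file_sync() above, so the close request
+ * below only has to drop the reference and close the blob; __wake_caller()
+ * posts the channel semaphore once __file_close_async() completes.
+ */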
args->file = file; + args->sem = &channel->sem; + args->fn.file_op = __wake_caller; + args->arg = req; + channel->send_request(__file_close, req); + sem_wait(&channel->sem); + + return args->rc; +} + +int +spdk_file_get_id(struct spdk_file *file, void *id, size_t size) +{ + if (size < sizeof(spdk_blob_id)) { + return -EINVAL; + } + + memcpy(id, &file->blobid, sizeof(spdk_blob_id)); + + return sizeof(spdk_blob_id); +} + +static void +cache_free_buffers(struct spdk_file *file) +{ + BLOBFS_TRACE(file, "free=%s\n", file->name); + pthread_spin_lock(&file->lock); + pthread_spin_lock(&g_caches_lock); + if (file->tree->present_mask == 0) { + pthread_spin_unlock(&g_caches_lock); + pthread_spin_unlock(&file->lock); + return; + } + spdk_tree_free_buffers(file->tree); + + TAILQ_REMOVE(&g_caches, file, cache_tailq); + /* If not freed, put it in the end of the queue */ + if (file->tree->present_mask != 0) { + TAILQ_INSERT_TAIL(&g_caches, file, cache_tailq); + } + file->last = NULL; + pthread_spin_unlock(&g_caches_lock); + pthread_spin_unlock(&file->lock); +} + +SPDK_LOG_REGISTER_COMPONENT("blobfs", SPDK_LOG_BLOBFS) +SPDK_LOG_REGISTER_COMPONENT("blobfs_rw", SPDK_LOG_BLOBFS_RW) diff --git a/src/spdk/lib/blobfs/blobfs_internal.h b/src/spdk/lib/blobfs/blobfs_internal.h new file mode 100644 index 00000000..4e2ae395 --- /dev/null +++ b/src/spdk/lib/blobfs/blobfs_internal.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_BLOBFS_INTERNAL_H +#define SPDK_BLOBFS_INTERNAL_H + +#include "tree.h" + +void spdk_fs_file_stat_async(struct spdk_filesystem *fs, const char *name, + spdk_file_stat_op_complete cb_fn, void *cb_arg); +void spdk_fs_create_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_args); +void spdk_fs_open_file_async(struct spdk_filesystem *fs, const char *name, uint32_t flags, + spdk_file_op_with_handle_complete cb_fn, void *cb_arg); +void spdk_file_close_async(struct spdk_file *file, spdk_file_op_complete cb_fn, void *cb_arg); +void spdk_fs_rename_file_async(struct spdk_filesystem *fs, const char *old_name, + const char *new_name, spdk_fs_op_complete cb_fn, + void *cb_arg); +void spdk_fs_delete_file_async(struct spdk_filesystem *fs, const char *name, + spdk_file_op_complete cb_fn, void *cb_arg); +void spdk_file_truncate_async(struct spdk_file *file, uint64_t length, + spdk_file_op_complete cb_fn, void *arg); +void spdk_file_write_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg); +void spdk_file_read_async(struct spdk_file *file, struct spdk_io_channel *channel, + void *payload, uint64_t offset, uint64_t length, + spdk_file_op_complete cb_fn, void *cb_arg); + +/* Sync all dirty cache buffers to the backing block device. For async + * usage models, completion of the sync indicates only that data written + * when the sync command was issued have been flushed to disk - it does + * not guarantee any writes submitted after the sync have been flushed, + * even if those writes are completed before the sync. + */ +void spdk_file_sync_async(struct spdk_file *file, struct spdk_io_channel *channel, + spdk_file_op_complete cb_fn, void *cb_arg); + +#endif /* SPDK_BLOBFS_INTERNAL_H_ */ diff --git a/src/spdk/lib/blobfs/tree.c b/src/spdk/lib/blobfs/tree.c new file mode 100644 index 00000000..ffb6bce6 --- /dev/null +++ b/src/spdk/lib/blobfs/tree.c @@ -0,0 +1,181 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/blobfs.h" +#include "blobfs_internal.h" + +#include "spdk/queue.h" +#include "spdk/assert.h" +#include "spdk/env.h" +#include "spdk_internal/log.h" + +uint32_t g_fs_cache_buffer_shift = CACHE_BUFFER_SHIFT_DEFAULT; + +struct cache_buffer * +spdk_tree_find_buffer(struct cache_tree *tree, uint64_t offset) +{ + uint64_t index; + + while (tree != NULL) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + if (index >= CACHE_TREE_WIDTH) { + return NULL; + } + if (tree->level == 0) { + return tree->u.buffer[index]; + } else { + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + tree = tree->u.tree[index]; + } + } + + return NULL; +} + +struct cache_buffer * +spdk_tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset) +{ + struct cache_buffer *buf; + + buf = spdk_tree_find_buffer(tree, offset); + if (buf != NULL && buf->bytes_filled > 0) { + return buf; + } else { + return NULL; + } +} + +struct cache_tree * +spdk_tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer) +{ + struct cache_tree *tree; + uint64_t index, offset; + + offset = buffer->offset; + while (offset >= CACHE_TREE_LEVEL_SIZE(root->level + 1)) { + if (root->present_mask != 0) { + tree = calloc(1, sizeof(*tree)); + tree->level = root->level + 1; + tree->u.tree[0] = root; + root = tree; + root->present_mask = 0x1ULL; + } else { + root->level++; + } + } + + tree = root; + while (tree->level > 0) { + index = offset / CACHE_TREE_LEVEL_SIZE(tree->level); + assert(index < CACHE_TREE_WIDTH); + offset &= CACHE_TREE_LEVEL_MASK(tree->level); + if (tree->u.tree[index] == NULL) { + tree->u.tree[index] = calloc(1, sizeof(*tree)); + tree->u.tree[index]->level = tree->level - 1; + tree->present_mask |= (1ULL << index); + } + tree = tree->u.tree[index]; + } + + index = offset / CACHE_BUFFER_SIZE; + assert(index < CACHE_TREE_WIDTH); + assert(tree->u.buffer[index] == NULL); + tree->u.buffer[index] = buffer; + tree->present_mask |= (1ULL << index); + return root; +} + +void +spdk_tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer) +{ + struct cache_tree *child; + uint64_t index; + + index = CACHE_TREE_INDEX(tree->level, buffer->offset); + + if (tree->level == 0) { + assert(tree->u.buffer[index] != NULL); + assert(buffer == tree->u.buffer[index]); + tree->present_mask &= ~(1ULL << index); + tree->u.buffer[index] = NULL; + spdk_cache_buffer_free(buffer); + return; + } + + child = tree->u.tree[index]; + assert(child != NULL); + spdk_tree_remove_buffer(child, buffer); + if (child->present_mask == 0) { + tree->present_mask &= ~(1ULL << index); + tree->u.tree[index] = NULL; + free(child); + } +} + +void +spdk_tree_free_buffers(struct cache_tree *tree) +{ + struct cache_buffer *buffer; + struct cache_tree *child; + uint32_t i; + + if (tree->present_mask == 0) { + return; + } + + if (tree->level == 0) { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + buffer = tree->u.buffer[i]; + if (buffer != NULL && buffer->in_progress == false 
&& + buffer->bytes_filled == buffer->bytes_flushed) { + spdk_cache_buffer_free(buffer); + tree->u.buffer[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } else { + for (i = 0; i < CACHE_TREE_WIDTH; i++) { + child = tree->u.tree[i]; + if (child != NULL) { + spdk_tree_free_buffers(child); + if (child->present_mask == 0) { + free(child); + tree->u.tree[i] = NULL; + tree->present_mask &= ~(1ULL << i); + } + } + } + } +} diff --git a/src/spdk/lib/blobfs/tree.h b/src/spdk/lib/blobfs/tree.h new file mode 100644 index 00000000..9bde83c6 --- /dev/null +++ b/src/spdk/lib/blobfs/tree.h @@ -0,0 +1,77 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_TREE_H_ +#define SPDK_TREE_H_ + +struct cache_buffer { + uint8_t *buf; + uint64_t offset; + uint32_t buf_size; + uint32_t bytes_filled; + uint32_t bytes_flushed; + bool in_progress; +}; + +extern uint32_t g_fs_cache_buffer_shift; + +#define CACHE_BUFFER_SHIFT_DEFAULT 18 +#define CACHE_BUFFER_SIZE (1U << g_fs_cache_buffer_shift) +#define NEXT_CACHE_BUFFER_OFFSET(offset) \ + (((offset + CACHE_BUFFER_SIZE) >> g_fs_cache_buffer_shift) << g_fs_cache_buffer_shift) + +#define CACHE_TREE_SHIFT 6 +#define CACHE_TREE_WIDTH (1U << CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SHIFT(level) (g_fs_cache_buffer_shift + (level) * CACHE_TREE_SHIFT) +#define CACHE_TREE_LEVEL_SIZE(level) (1ULL << CACHE_TREE_LEVEL_SHIFT(level)) +#define CACHE_TREE_LEVEL_MASK(level) (CACHE_TREE_LEVEL_SIZE(level) - 1) +#define CACHE_TREE_INDEX(level, offset) ((offset >> CACHE_TREE_LEVEL_SHIFT(level)) & (CACHE_TREE_WIDTH - 1)) + +struct cache_tree { + uint8_t level; + uint64_t present_mask; + union { + struct cache_buffer *buffer[CACHE_TREE_WIDTH]; + struct cache_tree *tree[CACHE_TREE_WIDTH]; + } u; +}; + +void spdk_cache_buffer_free(struct cache_buffer *cache_buffer); + +struct cache_tree *spdk_tree_insert_buffer(struct cache_tree *root, struct cache_buffer *buffer); +void spdk_tree_free_buffers(struct cache_tree *tree); +struct cache_buffer *spdk_tree_find_buffer(struct cache_tree *tree, uint64_t offset); +struct cache_buffer *spdk_tree_find_filled_buffer(struct cache_tree *tree, uint64_t offset); +void spdk_tree_remove_buffer(struct cache_tree *tree, struct cache_buffer *buffer); + +#endif /* SPDK_TREE_H_ */ diff --git a/src/spdk/lib/conf/Makefile b/src/spdk/lib/conf/Makefile new file mode 100644 index 00000000..0cdfda1d --- /dev/null +++ b/src/spdk/lib/conf/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
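+# Common compiler flags and the library build rules come from the shared SPDK
+# make fragments included below.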
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = conf.c +LIBNAME = conf + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/conf/conf.c b/src/spdk/lib/conf/conf.c new file mode 100644 index 00000000..384b088c --- /dev/null +++ b/src/spdk/lib/conf/conf.c @@ -0,0 +1,684 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" +#include "spdk/log.h" + +struct spdk_conf_value { + struct spdk_conf_value *next; + char *value; +}; + +struct spdk_conf_item { + struct spdk_conf_item *next; + char *key; + struct spdk_conf_value *val; +}; + +struct spdk_conf_section { + struct spdk_conf_section *next; + char *name; + int num; + struct spdk_conf_item *item; +}; + +struct spdk_conf { + char *file; + struct spdk_conf_section *current_section; + struct spdk_conf_section *section; +}; + +#define CF_DELIM " \t" + +#define LIB_MAX_TMPBUF 1024 + +static struct spdk_conf *default_config = NULL; + +struct spdk_conf * +spdk_conf_allocate(void) +{ + return calloc(1, sizeof(struct spdk_conf)); +} + +static void +free_conf_value(struct spdk_conf_value *vp) +{ + if (vp == NULL) { + return; + } + + if (vp->value) { + free(vp->value); + } + + free(vp); +} + +static void +free_all_conf_value(struct spdk_conf_value *vp) +{ + struct spdk_conf_value *next; + + if (vp == NULL) { + return; + } + + while (vp != NULL) { + next = vp->next; + free_conf_value(vp); + vp = next; + } +} + +static void +free_conf_item(struct spdk_conf_item *ip) +{ + if (ip == NULL) { + return; + } + + if (ip->val != NULL) { + free_all_conf_value(ip->val); + } + + if (ip->key != NULL) { + free(ip->key); + } + + free(ip); +} + +static void +free_all_conf_item(struct spdk_conf_item *ip) +{ + struct spdk_conf_item *next; + + if (ip == NULL) { + return; + } + + while (ip != NULL) { + next = ip->next; + free_conf_item(ip); + ip = next; + } +} + +static void +free_conf_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return; + } + + if (sp->item) { + free_all_conf_item(sp->item); + } + + if (sp->name) { + free(sp->name); + } + + free(sp); +} + +static void +free_all_conf_section(struct spdk_conf_section *sp) +{ + struct spdk_conf_section *next; + + if (sp == NULL) { + return; + } + + while (sp != NULL) { + next = sp->next; + free_conf_section(sp); + sp = next; + } +} + +void +spdk_conf_free(struct spdk_conf *cp) +{ + if (cp == NULL) { + return; + } + + if (cp->section != NULL) { + free_all_conf_section(cp->section); + } + + if (cp->file != NULL) { + free(cp->file); + } + + free(cp); +} + +static struct spdk_conf_section * +allocate_cf_section(void) +{ + return calloc(1, sizeof(struct spdk_conf_section)); +} + +static struct spdk_conf_item * +allocate_cf_item(void) +{ + return calloc(1, sizeof(struct spdk_conf_item)); +} + +static struct spdk_conf_value * +allocate_cf_value(void) +{ + return calloc(1, sizeof(struct spdk_conf_value)); +} + + +#define CHECK_CP_OR_USE_DEFAULT(cp) (((cp) == NULL) && (default_config != NULL)) ? 
default_config : (cp) + +struct spdk_conf_section * +spdk_conf_find_section(struct spdk_conf *cp, const char *name) +{ + struct spdk_conf_section *sp; + + if (name == NULL || name[0] == '\0') { + return NULL; + } + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + for (sp = cp->section; sp != NULL; sp = sp->next) { + if (sp->name != NULL && sp->name[0] == name[0] + && strcasecmp(sp->name, name) == 0) { + return sp; + } + } + + return NULL; +} + +struct spdk_conf_section * +spdk_conf_first_section(struct spdk_conf *cp) +{ + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + return NULL; + } + + return cp->section; +} + +struct spdk_conf_section * +spdk_conf_next_section(struct spdk_conf_section *sp) +{ + if (sp == NULL) { + return NULL; + } + + return sp->next; +} + +static void +append_cf_section(struct spdk_conf *cp, struct spdk_conf_section *sp) +{ + struct spdk_conf_section *last; + + cp = CHECK_CP_OR_USE_DEFAULT(cp); + if (cp == NULL) { + SPDK_ERRLOG("cp == NULL\n"); + return; + } + + if (cp->section == NULL) { + cp->section = sp; + return; + } + + for (last = cp->section; last->next != NULL; last = last->next) + ; + last->next = sp; +} + +static struct spdk_conf_item * +find_cf_nitem(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + int i; + + if (key == NULL || key[0] == '\0') { + return NULL; + } + + i = 0; + for (ip = sp->item; ip != NULL; ip = ip->next) { + if (ip->key != NULL && ip->key[0] == key[0] + && strcasecmp(ip->key, key) == 0) { + if (i == idx) { + return ip; + } + i++; + } + } + + return NULL; +} + +static void +append_cf_item(struct spdk_conf_section *sp, struct spdk_conf_item *ip) +{ + struct spdk_conf_item *last; + + if (sp == NULL) { + return; + } + + if (sp->item == NULL) { + sp->item = ip; + return; + } + + for (last = sp->item; last->next != NULL; last = last->next) + ; + last->next = ip; +} + +static void +append_cf_value(struct spdk_conf_item *ip, struct spdk_conf_value *vp) +{ + struct spdk_conf_value *last; + + if (ip == NULL) { + return; + } + + if (ip->val == NULL) { + ip->val = vp; + return; + } + + for (last = ip->val; last->next != NULL; last = last->next) + ; + last->next = vp; +} + +bool +spdk_conf_section_match_prefix(const struct spdk_conf_section *sp, const char *name_prefix) +{ + return strncasecmp(sp->name, name_prefix, strlen(name_prefix)) == 0; +} + +const char * +spdk_conf_section_get_name(const struct spdk_conf_section *sp) +{ + return sp->name; +} + +int +spdk_conf_section_get_num(const struct spdk_conf_section *sp) +{ + return sp->num; +} + +char * +spdk_conf_section_get_nmval(struct spdk_conf_section *sp, const char *key, int idx1, int idx2) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + int i; + + ip = find_cf_nitem(sp, key, idx1); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + for (i = 0; vp != NULL; vp = vp->next, i++) { + if (i == idx2) { + return vp->value; + } + } + + return NULL; +} + +char * +spdk_conf_section_get_nval(struct spdk_conf_section *sp, const char *key, int idx) +{ + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + + ip = find_cf_nitem(sp, key, idx); + if (ip == NULL) { + return NULL; + } + + vp = ip->val; + if (vp == NULL) { + return NULL; + } + + return vp->value; +} + +char * +spdk_conf_section_get_val(struct spdk_conf_section *sp, const char *key) +{ + return spdk_conf_section_get_nval(sp, key, 0); +} + +int +spdk_conf_section_get_intval(struct spdk_conf_section *sp, 
const char *key) +{ + const char *v; + int value; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return -1; + } + + value = (int)strtol(v, NULL, 10); + return value; +} + +bool +spdk_conf_section_get_boolval(struct spdk_conf_section *sp, const char *key, bool default_val) +{ + const char *v; + + v = spdk_conf_section_get_nval(sp, key, 0); + if (v == NULL) { + return default_val; + } + + if (!strcasecmp(v, "Yes") || !strcasecmp(v, "Y") || !strcasecmp(v, "True")) { + return true; + } + + if (!strcasecmp(v, "No") || !strcasecmp(v, "N") || !strcasecmp(v, "False")) { + return false; + } + + return default_val; +} + +static int +parse_line(struct spdk_conf *cp, char *lp) +{ + struct spdk_conf_section *sp; + struct spdk_conf_item *ip; + struct spdk_conf_value *vp; + char *arg; + char *key; + char *val; + char *p; + int num; + + arg = spdk_str_trim(lp); + if (arg == NULL) { + SPDK_ERRLOG("no section\n"); + return -1; + } + + if (arg[0] == '[') { + /* section */ + arg++; + key = spdk_strsepq(&arg, "]"); + if (key == NULL || arg != NULL) { + SPDK_ERRLOG("broken section\n"); + return -1; + } + /* determine section number */ + for (p = key; *p != '\0' && !isdigit((int) *p); p++) + ; + if (*p != '\0') { + num = (int)strtol(p, NULL, 10); + } else { + num = 0; + } + + sp = spdk_conf_find_section(cp, key); + if (sp == NULL) { + sp = allocate_cf_section(); + append_cf_section(cp, sp); + + sp->name = strdup(key); + if (sp->name == NULL) { + SPDK_ERRLOG("cannot duplicate %s to sp->name\n", key); + return -1; + } + } + cp->current_section = sp; + + + sp->num = num; + } else { + /* parameters */ + sp = cp->current_section; + if (sp == NULL) { + SPDK_ERRLOG("unknown section\n"); + return -1; + } + key = spdk_strsepq(&arg, CF_DELIM); + if (key == NULL) { + SPDK_ERRLOG("broken key\n"); + return -1; + } + + ip = allocate_cf_item(); + if (ip == NULL) { + SPDK_ERRLOG("cannot allocate cf item\n"); + return -1; + } + append_cf_item(sp, ip); + ip->key = strdup(key); + if (ip->key == NULL) { + SPDK_ERRLOG("cannot make duplicate of %s\n", key); + return -1; + } + ip->val = NULL; + if (arg != NULL) { + /* key has value(s) */ + while (arg != NULL) { + val = spdk_strsepq(&arg, CF_DELIM); + vp = allocate_cf_value(); + if (vp == NULL) { + SPDK_ERRLOG("cannot allocate cf value\n"); + return -1; + } + append_cf_value(ip, vp); + vp->value = strdup(val); + if (vp->value == NULL) { + SPDK_ERRLOG("cannot duplicate %s to vp->value\n", val); + return -1; + } + } + } + } + + return 0; +} + +static char * +fgets_line(FILE *fp) +{ + char *dst, *dst2, *p; + size_t total, len; + + dst = p = malloc(LIB_MAX_TMPBUF); + if (!dst) { + return NULL; + } + + dst[0] = '\0'; + total = 0; + + while (fgets(p, LIB_MAX_TMPBUF, fp) != NULL) { + len = strlen(p); + total += len; + if (len + 1 < LIB_MAX_TMPBUF || dst[total - 1] == '\n') { + dst2 = realloc(dst, total + 1); + if (!dst2) { + free(dst); + return NULL; + } else { + return dst2; + } + } + + dst2 = realloc(dst, total + LIB_MAX_TMPBUF); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + p = dst + total; + } + + if (feof(fp) && total != 0) { + dst2 = realloc(dst, total + 2); + if (!dst2) { + free(dst); + return NULL; + } else { + dst = dst2; + } + + dst[total] = '\n'; + dst[total + 1] = '\0'; + return dst; + } + + free(dst); + + return NULL; +} + +int +spdk_conf_read(struct spdk_conf *cp, const char *file) +{ + FILE *fp; + char *lp, *p; + char *lp2, *q; + int line; + int n, n2; + + if (file == NULL || file[0] == '\0') { + return -1; + } + + fp = 
fopen(file, "r"); + if (fp == NULL) { + SPDK_ERRLOG("open error: %s\n", file); + return -1; + } + + cp->file = strdup(file); + if (cp->file == NULL) { + SPDK_ERRLOG("cannot duplicate %s to cp->file\n", file); + fclose(fp); + return -1; + } + + line = 1; + while ((lp = fgets_line(fp)) != NULL) { + /* skip spaces */ + for (p = lp; *p != '\0' && isspace((int) *p); p++) + ; + /* skip comment, empty line */ + if (p[0] == '#' || p[0] == '\0') { + goto next_line; + } + + /* concatenate line end with '\' */ + n = strlen(p); + while (n > 2 && p[n - 1] == '\n' && p[n - 2] == '\\') { + n -= 2; + lp2 = fgets_line(fp); + if (lp2 == NULL) { + break; + } + + line++; + n2 = strlen(lp2); + + q = malloc(n + n2 + 1); + if (!q) { + free(lp2); + free(lp); + SPDK_ERRLOG("malloc failed at line %d of %s\n", line, cp->file); + fclose(fp); + return -1; + } + + memcpy(q, p, n); + memcpy(q + n, lp2, n2); + q[n + n2] = '\0'; + free(lp2); + free(lp); + p = lp = q; + n += n2; + } + + /* parse one line */ + if (parse_line(cp, p) < 0) { + SPDK_ERRLOG("parse error at line %d of %s\n", line, cp->file); + } +next_line: + line++; + free(lp); + } + + fclose(fp); + return 0; +} + +void +spdk_conf_set_as_default(struct spdk_conf *cp) +{ + default_config = cp; +} diff --git a/src/spdk/lib/copy/Makefile b/src/spdk/lib/copy/Makefile new file mode 100644 index 00000000..31f983b5 --- /dev/null +++ b/src/spdk/lib/copy/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = copy +C_SRCS = copy_engine.c + +DIRS-y = ioat + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/copy/copy_engine.c b/src/spdk/lib/copy/copy_engine.c new file mode 100644 index 00000000..921e17fa --- /dev/null +++ b/src/spdk/lib/copy/copy_engine.c @@ -0,0 +1,318 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/copy_engine.h" + +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/log.h" +#include "spdk/thread.h" + +static size_t g_max_copy_module_size = 0; + +static struct spdk_copy_engine *hw_copy_engine = NULL; +/* Memcpy engine always exist */ +static struct spdk_copy_engine *mem_copy_engine = NULL; + +TAILQ_HEAD(, spdk_copy_module_if) spdk_copy_module_list = + TAILQ_HEAD_INITIALIZER(spdk_copy_module_list); + +struct copy_io_channel { + struct spdk_copy_engine *engine; + struct spdk_io_channel *ch; +}; + +struct spdk_copy_module_if *g_copy_engine_module = NULL; +spdk_copy_fini_cb g_fini_cb_fn = NULL; +void *g_fini_cb_arg = NULL; + +void +spdk_copy_engine_register(struct spdk_copy_engine *copy_engine) +{ + assert(hw_copy_engine == NULL); + hw_copy_engine = copy_engine; +} + +static void +spdk_memcpy_register(struct spdk_copy_engine *copy_engine) +{ + assert(mem_copy_engine == NULL); + mem_copy_engine = copy_engine; +} + +static void +spdk_memcpy_unregister(void) +{ + mem_copy_engine = NULL; +} + +static void +copy_engine_done(void *ref, int status) +{ + struct spdk_copy_task *req = (struct spdk_copy_task *)ref; + + req->cb(req, status); +} + +int +spdk_copy_submit(struct spdk_copy_task *copy_req, struct spdk_io_channel *ch, + void *dst, void *src, uint64_t nbytes, spdk_copy_completion_cb cb) +{ + struct spdk_copy_task *req = copy_req; + struct copy_io_channel *copy_ch = spdk_io_channel_get_ctx(ch); + + req->cb = cb; + return copy_ch->engine->copy(req->offload_ctx, copy_ch->ch, dst, src, nbytes, + copy_engine_done); +} + +int +spdk_copy_submit_fill(struct spdk_copy_task *copy_req, struct spdk_io_channel *ch, + void *dst, uint8_t fill, uint64_t nbytes, spdk_copy_completion_cb cb) +{ + struct spdk_copy_task *req = copy_req; + struct copy_io_channel *copy_ch = spdk_io_channel_get_ctx(ch); + + req->cb = cb; + return copy_ch->engine->fill(req->offload_ctx, copy_ch->ch, dst, fill, nbytes, + copy_engine_done); +} + +/* memcpy default copy 
engine */ +static int +mem_copy_submit(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_copy_completion_cb cb) +{ + struct spdk_copy_task *copy_req; + + memcpy(dst, src, (size_t)nbytes); + + copy_req = (struct spdk_copy_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_copy_task, offload_ctx)); + cb(copy_req, 0); + return 0; +} + +static int +mem_copy_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, uint64_t nbytes, + spdk_copy_completion_cb cb) +{ + struct spdk_copy_task *copy_req; + + memset(dst, fill, nbytes); + copy_req = (struct spdk_copy_task *)((uintptr_t)cb_arg - + offsetof(struct spdk_copy_task, offload_ctx)); + cb(copy_req, 0); + + return 0; +} + +static struct spdk_io_channel *mem_get_io_channel(void); + +static struct spdk_copy_engine memcpy_copy_engine = { + .copy = mem_copy_submit, + .fill = mem_copy_fill, + .get_io_channel = mem_get_io_channel, +}; + +static int +memcpy_create_cb(void *io_device, void *ctx_buf) +{ + return 0; +} + +static void +memcpy_destroy_cb(void *io_device, void *ctx_buf) +{ +} + +static struct spdk_io_channel *mem_get_io_channel(void) +{ + return spdk_get_io_channel(&memcpy_copy_engine); +} + +static size_t +copy_engine_mem_get_ctx_size(void) +{ + return sizeof(struct spdk_copy_task); +} + +size_t +spdk_copy_task_size(void) +{ + return g_max_copy_module_size; +} + +void spdk_copy_module_list_add(struct spdk_copy_module_if *copy_module) +{ + TAILQ_INSERT_TAIL(&spdk_copy_module_list, copy_module, tailq); + if (copy_module->get_ctx_size && copy_module->get_ctx_size() > g_max_copy_module_size) { + g_max_copy_module_size = copy_module->get_ctx_size(); + } +} + +static int +copy_create_cb(void *io_device, void *ctx_buf) +{ + struct copy_io_channel *copy_ch = ctx_buf; + + if (hw_copy_engine != NULL) { + copy_ch->ch = hw_copy_engine->get_io_channel(); + if (copy_ch->ch != NULL) { + copy_ch->engine = hw_copy_engine; + return 0; + } + } + + copy_ch->ch = mem_copy_engine->get_io_channel(); + assert(copy_ch->ch != NULL); + copy_ch->engine = mem_copy_engine; + return 0; +} + +static void +copy_destroy_cb(void *io_device, void *ctx_buf) +{ + struct copy_io_channel *copy_ch = ctx_buf; + + spdk_put_io_channel(copy_ch->ch); +} + +struct spdk_io_channel * +spdk_copy_engine_get_io_channel(void) +{ + return spdk_get_io_channel(&spdk_copy_module_list); +} + +static int +copy_engine_mem_init(void) +{ + spdk_memcpy_register(&memcpy_copy_engine); + spdk_io_device_register(&memcpy_copy_engine, memcpy_create_cb, memcpy_destroy_cb, 0, + "memcpy_engine"); + + return 0; +} + +static void +copy_engine_mem_fini(void *ctxt) +{ + spdk_io_device_unregister(&memcpy_copy_engine, NULL); + spdk_memcpy_unregister(); + + spdk_copy_engine_module_finish(); +} + +static void +spdk_copy_engine_module_initialize(void) +{ + struct spdk_copy_module_if *copy_engine_module; + + TAILQ_FOREACH(copy_engine_module, &spdk_copy_module_list, tailq) { + copy_engine_module->module_init(); + } +} + +int +spdk_copy_engine_initialize(void) +{ + spdk_copy_engine_module_initialize(); + /* + * We need a unique identifier for the copy engine framework, so use the + * spdk_copy_module_list address for this purpose. 
+ */ + spdk_io_device_register(&spdk_copy_module_list, copy_create_cb, copy_destroy_cb, + sizeof(struct copy_io_channel), "copy_module"); + + return 0; +} + +static void +spdk_copy_engine_module_finish_cb(void) +{ + spdk_copy_fini_cb cb_fn = g_fini_cb_fn; + + cb_fn(g_fini_cb_arg); + g_fini_cb_fn = NULL; + g_fini_cb_arg = NULL; +} + +void +spdk_copy_engine_module_finish(void) +{ + if (!g_copy_engine_module) { + g_copy_engine_module = TAILQ_FIRST(&spdk_copy_module_list); + } else { + g_copy_engine_module = TAILQ_NEXT(g_copy_engine_module, tailq); + } + + if (!g_copy_engine_module) { + spdk_copy_engine_module_finish_cb(); + return; + } + + if (g_copy_engine_module->module_fini) { + spdk_thread_send_msg(spdk_get_thread(), g_copy_engine_module->module_fini, NULL); + } else { + spdk_copy_engine_module_finish(); + } +} + +void +spdk_copy_engine_finish(spdk_copy_fini_cb cb_fn, void *cb_arg) +{ + assert(cb_fn != NULL); + + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_io_device_unregister(&spdk_copy_module_list, NULL); + spdk_copy_engine_module_finish(); +} + +void +spdk_copy_engine_config_text(FILE *fp) +{ + struct spdk_copy_module_if *copy_engine_module; + + TAILQ_FOREACH(copy_engine_module, &spdk_copy_module_list, tailq) { + if (copy_engine_module->config_text) { + copy_engine_module->config_text(fp); + } + } +} + +SPDK_COPY_MODULE_REGISTER(copy_engine_mem_init, copy_engine_mem_fini, + NULL, copy_engine_mem_get_ctx_size) diff --git a/src/spdk/lib/copy/ioat/Makefile b/src/spdk/lib/copy/ioat/Makefile new file mode 100644 index 00000000..3d19e38f --- /dev/null +++ b/src/spdk/lib/copy/ioat/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = copy_ioat +C_SRCS = copy_engine_ioat.c copy_engine_ioat_rpc.c + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/copy/ioat/copy_engine_ioat.c b/src/spdk/lib/copy/ioat/copy_engine_ioat.c new file mode 100644 index 00000000..40bc6cf5 --- /dev/null +++ b/src/spdk/lib/copy/ioat/copy_engine_ioat.c @@ -0,0 +1,421 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "copy_engine_ioat.h" + +#include "spdk/stdinc.h" + +#include "spdk_internal/copy_engine.h" +#include "spdk_internal/log.h" + +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/event.h" +#include "spdk/thread.h" +#include "spdk/ioat.h" + +static bool g_ioat_enable = false; + +struct ioat_probe_ctx { + int num_whitelist_devices; + struct spdk_pci_addr whitelist[IOAT_MAX_CHANNELS]; +}; + +static struct ioat_probe_ctx g_probe_ctx; + +struct ioat_device { + struct spdk_ioat_chan *ioat; + bool is_allocated; + /** linked list pointer for device list */ + TAILQ_ENTRY(ioat_device) tailq; +}; + +static TAILQ_HEAD(, ioat_device) g_devices = TAILQ_HEAD_INITIALIZER(g_devices); +static pthread_mutex_t g_ioat_mutex = PTHREAD_MUTEX_INITIALIZER; + +struct ioat_io_channel { + struct spdk_ioat_chan *ioat_ch; + struct ioat_device *ioat_dev; + struct spdk_poller *poller; +}; + +static int +ioat_find_dev_by_whitelist_bdf(const struct spdk_pci_addr *pci_addr, + const struct spdk_pci_addr *whitelist, + int num_whitelist_devices) +{ + int i; + + for (i = 0; i < num_whitelist_devices; i++) { + if (spdk_pci_addr_compare(pci_addr, &whitelist[i]) == 0) { + return 1; + } + } + return 0; +} + +static struct ioat_device * +ioat_allocate_device(void) +{ + struct ioat_device *dev; + + pthread_mutex_lock(&g_ioat_mutex); + TAILQ_FOREACH(dev, &g_devices, tailq) { + if (!dev->is_allocated) { + dev->is_allocated = true; + pthread_mutex_unlock(&g_ioat_mutex); + return dev; + } + } + pthread_mutex_unlock(&g_ioat_mutex); + + return NULL; +} + +static void +ioat_free_device(struct ioat_device *dev) +{ + pthread_mutex_lock(&g_ioat_mutex); + dev->is_allocated = false; + pthread_mutex_unlock(&g_ioat_mutex); +} + +struct ioat_task { + spdk_copy_completion_cb cb; +}; + +static int copy_engine_ioat_init(void); +static void copy_engine_ioat_exit(void *ctx); +static void copy_engine_ioat_config_text(FILE *fp); + +static size_t +copy_engine_ioat_get_ctx_size(void) +{ + return sizeof(struct ioat_task) + sizeof(struct spdk_copy_task); +} + +SPDK_COPY_MODULE_REGISTER(copy_engine_ioat_init, copy_engine_ioat_exit, + copy_engine_ioat_config_text, + copy_engine_ioat_get_ctx_size) + +static void +copy_engine_ioat_exit(void *ctx) +{ + struct ioat_device *dev; + + while (!TAILQ_EMPTY(&g_devices)) { + dev = TAILQ_FIRST(&g_devices); + TAILQ_REMOVE(&g_devices, dev, tailq); + spdk_ioat_detach(dev->ioat); + ioat_free_device(dev); + spdk_dma_free(dev); + } + spdk_copy_engine_module_finish(); +} + +static void +ioat_done(void *cb_arg) +{ + struct spdk_copy_task *copy_req; + struct ioat_task *ioat_task = cb_arg; + + copy_req = (struct spdk_copy_task *) + ((uintptr_t)ioat_task - + offsetof(struct spdk_copy_task, offload_ctx)); + + ioat_task->cb(copy_req, 0); +} + +static int +ioat_copy_submit(void *cb_arg, struct spdk_io_channel *ch, void *dst, void *src, uint64_t nbytes, + spdk_copy_completion_cb cb) +{ + struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + + assert(ioat_ch->ioat_ch != NULL); + + ioat_task->cb = cb; + + return spdk_ioat_submit_copy(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, src, nbytes); +} + +static int +ioat_copy_submit_fill(void *cb_arg, struct spdk_io_channel *ch, void *dst, uint8_t fill, + uint64_t nbytes, spdk_copy_completion_cb cb) +{ + struct ioat_task *ioat_task = (struct ioat_task *)cb_arg; + struct ioat_io_channel *ioat_ch = spdk_io_channel_get_ctx(ch); + uint64_t fill64 = 0x0101010101010101ULL * fill; + + assert(ioat_ch->ioat_ch 
!= NULL); + + ioat_task->cb = cb; + + return spdk_ioat_submit_fill(ioat_ch->ioat_ch, ioat_task, ioat_done, dst, fill64, nbytes); +} + +static int +ioat_poll(void *arg) +{ + struct spdk_ioat_chan *chan = arg; + + spdk_ioat_process_events(chan); + + return -1; +} + +static struct spdk_io_channel *ioat_get_io_channel(void); + +static struct spdk_copy_engine ioat_copy_engine = { + .copy = ioat_copy_submit, + .fill = ioat_copy_submit_fill, + .get_io_channel = ioat_get_io_channel, +}; + +static int +ioat_create_cb(void *io_device, void *ctx_buf) +{ + struct ioat_io_channel *ch = ctx_buf; + struct ioat_device *ioat_dev; + + ioat_dev = ioat_allocate_device(); + if (ioat_dev == NULL) { + return -1; + } + + ch->ioat_dev = ioat_dev; + ch->ioat_ch = ioat_dev->ioat; + ch->poller = spdk_poller_register(ioat_poll, ch->ioat_ch, 0); + return 0; +} + +static void +ioat_destroy_cb(void *io_device, void *ctx_buf) +{ + struct ioat_io_channel *ch = ctx_buf; + + ioat_free_device(ch->ioat_dev); + spdk_poller_unregister(&ch->poller); +} + +static struct spdk_io_channel * +ioat_get_io_channel(void) +{ + return spdk_get_io_channel(&ioat_copy_engine); +} + +static bool +probe_cb(void *cb_ctx, struct spdk_pci_device *pci_dev) +{ + struct ioat_probe_ctx *ctx = cb_ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(pci_dev); + + SPDK_INFOLOG(SPDK_LOG_COPY_IOAT, + " Found matching device at %04x:%02x:%02x.%x vendor:0x%04x device:0x%04x\n", + pci_addr.domain, + pci_addr.bus, + pci_addr.dev, + pci_addr.func, + spdk_pci_device_get_vendor_id(pci_dev), + spdk_pci_device_get_device_id(pci_dev)); + + if (ctx->num_whitelist_devices > 0 && + !ioat_find_dev_by_whitelist_bdf(&pci_addr, ctx->whitelist, ctx->num_whitelist_devices)) { + return false; + } + + /* Claim the device in case conflict with other process */ + if (spdk_pci_device_claim(&pci_addr) < 0) { + return false; + } + + return true; +} + +static void +attach_cb(void *cb_ctx, struct spdk_pci_device *pci_dev, struct spdk_ioat_chan *ioat) +{ + struct ioat_device *dev; + + dev = spdk_dma_zmalloc(sizeof(*dev), 0, NULL); + if (dev == NULL) { + SPDK_ERRLOG("Failed to allocate device struct\n"); + return; + } + + dev->ioat = ioat; + TAILQ_INSERT_TAIL(&g_devices, dev, tailq); +} + +void +copy_engine_ioat_enable_probe(void) +{ + g_ioat_enable = true; +} + +static int +copy_engine_ioat_add_whitelist_device(const char *pci_bdf) +{ + if (pci_bdf == NULL) { + return -1; + } + + if (g_probe_ctx.num_whitelist_devices >= IOAT_MAX_CHANNELS) { + SPDK_ERRLOG("Ioat whitelist is full (max size is %d)\n", + IOAT_MAX_CHANNELS); + return -1; + } + + if (spdk_pci_addr_parse(&g_probe_ctx.whitelist[g_probe_ctx.num_whitelist_devices], + pci_bdf) < 0) { + SPDK_ERRLOG("Invalid address %s\n", pci_bdf); + return -1; + } + + g_probe_ctx.num_whitelist_devices++; + + return 0; +} + +int +copy_engine_ioat_add_whitelist_devices(const char *pci_bdfs[], size_t num_pci_bdfs) +{ + size_t i; + + for (i = 0; i < num_pci_bdfs; i++) { + if (copy_engine_ioat_add_whitelist_device(pci_bdfs[i]) < 0) { + return -1; + } + } + + return 0; +} + +static int +copy_engine_ioat_read_config_file_params(struct spdk_conf_section *sp) +{ + int i; + char *val, *pci_bdf; + + if (spdk_conf_section_get_boolval(sp, "Enable", false)) { + g_ioat_enable = true; + /* Enable Ioat */ + } + + val = spdk_conf_section_get_val(sp, "Disable"); + if (val != NULL) { + SPDK_WARNLOG("\"Disable\" option is deprecated and will be removed in a future release.\n"); + SPDK_WARNLOG("IOAT is now disabled by default. 
It may be enabled by \"Enable Yes\"\n"); + + if (g_ioat_enable && (strcasecmp(val, "Yes") == 0)) { + SPDK_ERRLOG("\"Enable Yes\" and \"Disable Yes\" cannot be set at the same time\n"); + return -1; + } + } + + /* Init the whitelist */ + for (i = 0; ; i++) { + pci_bdf = spdk_conf_section_get_nmval(sp, "Whitelist", i, 0); + if (!pci_bdf) { + break; + } + + if (copy_engine_ioat_add_whitelist_device(pci_bdf) < 0) { + return -1; + } + } + + return 0; +} + +static int +copy_engine_ioat_init(void) +{ + struct spdk_conf_section *sp; + int rc; + + sp = spdk_conf_find_section(NULL, "Ioat"); + if (sp != NULL) { + rc = copy_engine_ioat_read_config_file_params(sp); + if (rc != 0) { + SPDK_ERRLOG("copy_engine_ioat_read_config_file_params() failed\n"); + return rc; + } + } + + if (!g_ioat_enable) { + return 0; + } + + if (spdk_ioat_probe(&g_probe_ctx, probe_cb, attach_cb) != 0) { + SPDK_ERRLOG("spdk_ioat_probe() failed\n"); + return -1; + } + + SPDK_INFOLOG(SPDK_LOG_COPY_IOAT, "Ioat Copy Engine Offload Enabled\n"); + spdk_copy_engine_register(&ioat_copy_engine); + spdk_io_device_register(&ioat_copy_engine, ioat_create_cb, ioat_destroy_cb, + sizeof(struct ioat_io_channel), "ioat_copy_engine"); + return 0; +} + +#define COPY_ENGINE_IOAT_HEADER_TMPL \ +"[Ioat]\n" \ +" # Users may not want to use offload even it is available.\n" \ +" # Users may use the whitelist to initialize specified devices, IDS\n" \ +" # uses BUS:DEVICE.FUNCTION to identify each Ioat channel.\n" + +#define COPY_ENGINE_IOAT_ENABLE_TMPL \ +" Enable %s\n" + +#define COPY_ENGINE_IOAT_WHITELIST_TMPL \ +" Whitelist %.4" PRIx16 ":%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 "\n" + +static void +copy_engine_ioat_config_text(FILE *fp) +{ + int i; + struct spdk_pci_addr *dev; + + fprintf(fp, COPY_ENGINE_IOAT_HEADER_TMPL); + fprintf(fp, COPY_ENGINE_IOAT_ENABLE_TMPL, g_ioat_enable ? "Yes" : "No"); + + for (i = 0; i < g_probe_ctx.num_whitelist_devices; i++) { + dev = &g_probe_ctx.whitelist[i]; + fprintf(fp, COPY_ENGINE_IOAT_WHITELIST_TMPL, + dev->domain, dev->bus, dev->dev, dev->func); + } +} + +SPDK_LOG_REGISTER_COMPONENT("copy_ioat", SPDK_LOG_COPY_IOAT) diff --git a/src/spdk/lib/copy/ioat/copy_engine_ioat.h b/src/spdk/lib/copy/ioat/copy_engine_ioat.h new file mode 100644 index 00000000..ae69fb2d --- /dev/null +++ b/src/spdk/lib/copy/ioat/copy_engine_ioat.h @@ -0,0 +1,44 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
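
For reference, the [Ioat] section consumed by copy_engine_ioat_read_config_file_params() above looks like the sketch below; the BDF addresses are placeholders, and each line follows the same format that copy_engine_ioat_config_text() writes back out via the COPY_ENGINE_IOAT_*_TMPL strings:

  [Ioat]
    # Offload is disabled by default; enable probing explicitly.
    Enable Yes
    # Optionally restrict probing to specific channels, one BDF per Whitelist line.
    Whitelist 0000:00:04.0
    Whitelist 0000:00:04.1
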
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_COPY_ENGINE_IOAT_H +#define SPDK_COPY_ENGINE_IOAT_H + +#include "spdk/stdinc.h" + +#define IOAT_MAX_CHANNELS 64 + +int copy_engine_ioat_add_whitelist_devices(const char *pci_bdfs[], size_t num_pci_bdfs); +void copy_engine_ioat_enable_probe(void); + +#endif /* SPDK_COPY_ENGINE_IOAT_H */ diff --git a/src/spdk/lib/copy/ioat/copy_engine_ioat_rpc.c b/src/spdk/lib/copy/ioat/copy_engine_ioat_rpc.c new file mode 100644 index 00000000..ae03fdb1 --- /dev/null +++ b/src/spdk/lib/copy/ioat/copy_engine_ioat_rpc.c @@ -0,0 +1,118 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "copy_engine_ioat.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/event.h" + +struct rpc_pci_whitelist { + size_t num_bdfs; + char *bdfs[IOAT_MAX_CHANNELS]; +}; + +static int +decode_rpc_pci_whitelist(const struct spdk_json_val *val, void *out) +{ + struct rpc_pci_whitelist *pci_whitelist = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, pci_whitelist->bdfs, + IOAT_MAX_CHANNELS, &pci_whitelist->num_bdfs, sizeof(char *)); +} + +static void +free_rpc_pci_whitelist(struct rpc_pci_whitelist *list) +{ + size_t i; + + for (i = 0; i < list->num_bdfs; i++) { + free(list->bdfs[i]); + } +} + +struct rpc_copy_engine_ioat { + struct rpc_pci_whitelist pci_whitelist; +}; + +static void +free_rpc_copy_engine_ioat(struct rpc_copy_engine_ioat *p) +{ + free_rpc_pci_whitelist(&p->pci_whitelist); +} + +static const struct spdk_json_object_decoder rpc_copy_engine_ioat_decoder[] = { + {"pci_whitelist", offsetof(struct rpc_copy_engine_ioat, pci_whitelist), decode_rpc_pci_whitelist}, +}; + +static void +spdk_rpc_scan_copy_engine_ioat(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_copy_engine_ioat req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_copy_engine_ioat_decoder, + SPDK_COUNTOF(rpc_copy_engine_ioat_decoder), + &req)) { + free_rpc_copy_engine_ioat(&req); + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = copy_engine_ioat_add_whitelist_devices((const char **)req.pci_whitelist.bdfs, + req.pci_whitelist.num_bdfs); + free_rpc_copy_engine_ioat(&req); + if (rc < 0) { + SPDK_ERRLOG("copy_engine_ioat_add_whitelist_devices() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + copy_engine_ioat_enable_probe(); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("scan_ioat_copy_engine", spdk_rpc_scan_copy_engine_ioat, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/env_dpdk/Makefile b/src/spdk/lib/env_dpdk/Makefile new file mode 100644 index 00000000..b7a6961f --- /dev/null +++ b/src/spdk/lib/env_dpdk/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
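
The scan_ioat_copy_engine RPC registered above accepts an optional pci_whitelist array (up to IOAT_MAX_CHANNELS entries) and simply writes back true on success. A sketch of the JSON-RPC 2.0 request it decodes, with placeholder BDF addresses:

  {
    "jsonrpc": "2.0",
    "method": "scan_ioat_copy_engine",
    "id": 1,
    "params": {
      "pci_whitelist": ["0000:00:04.0", "0000:00:04.1"]
    }
  }

Omitting "params" entirely is also valid; in that case probing is enabled for all channels.
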
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = env.c memory.c pci.c vtophys.c init.c threads.c +C_SRCS += pci_nvme.c pci_ioat.c pci_virtio.c +LIBNAME = env_dpdk + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/env_dpdk/env.c b/src/spdk/lib/env_dpdk/env.c new file mode 100644 index 00000000..a5238e54 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.c @@ -0,0 +1,419 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" + +#include +#include +#include +#include +#include +#include + +static uint64_t +virt_to_phys(void *vaddr) +{ + uint64_t ret; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + ret = rte_malloc_virt2iova(vaddr); + if (ret != RTE_BAD_IOVA) { + return ret; + } +#else + ret = rte_malloc_virt2phy(vaddr); + if (ret != RTE_BAD_PHYS_ADDR) { + return ret; + } +#endif + + return spdk_vtophys(vaddr); +} + +void * +spdk_malloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + if (flags == 0) { + return NULL; + } + + void *buf = rte_malloc_socket(NULL, size, align, socket_id); + if (buf && phys_addr) { + *phys_addr = virt_to_phys(buf); + } + return buf; +} + +void * +spdk_zmalloc(size_t size, size_t align, uint64_t *phys_addr, int socket_id, uint32_t flags) +{ + void *buf = spdk_malloc(size, align, phys_addr, socket_id, flags); + if (buf) { + memset(buf, 0, size); + } + return buf; +} + +void +spdk_free(void *buf) +{ + rte_free(buf); +} + +void * +spdk_dma_malloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_malloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_zmalloc_socket(size_t size, size_t align, uint64_t *phys_addr, int socket_id) +{ + return spdk_zmalloc(size, align, phys_addr, socket_id, (SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE)); +} + +void * +spdk_dma_malloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_malloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_zmalloc(size_t size, size_t align, uint64_t *phys_addr) +{ + return spdk_dma_zmalloc_socket(size, align, phys_addr, SPDK_ENV_SOCKET_ID_ANY); +} + +void * +spdk_dma_realloc(void *buf, size_t size, size_t align, uint64_t *phys_addr) +{ + void *new_buf = rte_realloc(buf, size, align); + if (new_buf && phys_addr) { + *phys_addr = virt_to_phys(new_buf); + } + return new_buf; +} + +void +spdk_dma_free(void *buf) +{ + spdk_free(buf); +} + +void * +spdk_memzone_reserve_aligned(const char *name, size_t len, int socket_id, + unsigned flags, unsigned align) +{ + const struct rte_memzone *mz; + unsigned dpdk_flags = 0; + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) + /* Older DPDKs do not offer such flag since their + * memzones are iova-contiguous by default. 
+ */ + if ((flags & SPDK_MEMZONE_NO_IOVA_CONTIG) == 0) { + dpdk_flags |= RTE_MEMZONE_IOVA_CONTIG; + } +#endif + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + mz = rte_memzone_reserve_aligned(name, len, socket_id, dpdk_flags, align); + + if (mz != NULL) { + memset(mz->addr, 0, len); + return mz->addr; + } else { + return NULL; + } +} + +void * +spdk_memzone_reserve(const char *name, size_t len, int socket_id, unsigned flags) +{ + return spdk_memzone_reserve_aligned(name, len, socket_id, flags, + RTE_CACHE_LINE_SIZE); +} + +void * +spdk_memzone_lookup(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return mz->addr; + } else { + return NULL; + } +} + +int +spdk_memzone_free(const char *name) +{ + const struct rte_memzone *mz = rte_memzone_lookup(name); + + if (mz != NULL) { + return rte_memzone_free(mz); + } + + return -1; +} + +void +spdk_memzone_dump(FILE *f) +{ + rte_memzone_dump(f); +} + +struct spdk_mempool * +spdk_mempool_create_ctor(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id, + spdk_mempool_obj_cb_t *obj_init, void *obj_init_arg) +{ + struct rte_mempool *mp; + size_t tmp; + + if (socket_id == SPDK_ENV_SOCKET_ID_ANY) { + socket_id = SOCKET_ID_ANY; + } + + /* No more than half of all elements can be in cache */ + tmp = (count / 2) / rte_lcore_count(); + if (cache_size > tmp) { + cache_size = tmp; + } + + if (cache_size > RTE_MEMPOOL_CACHE_MAX_SIZE) { + cache_size = RTE_MEMPOOL_CACHE_MAX_SIZE; + } + + mp = rte_mempool_create(name, count, ele_size, cache_size, + 0, NULL, NULL, (rte_mempool_obj_cb_t *)obj_init, obj_init_arg, + socket_id, MEMPOOL_F_NO_PHYS_CONTIG); + + return (struct spdk_mempool *)mp; +} + + +struct spdk_mempool * +spdk_mempool_create(const char *name, size_t count, + size_t ele_size, size_t cache_size, int socket_id) +{ + return spdk_mempool_create_ctor(name, count, ele_size, cache_size, socket_id, + NULL, NULL); +} + +char * +spdk_mempool_get_name(struct spdk_mempool *mp) +{ + return ((struct rte_mempool *)mp)->name; +} + +void +spdk_mempool_free(struct spdk_mempool *mp) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1) + rte_mempool_free((struct rte_mempool *)mp); +#endif +} + +void * +spdk_mempool_get(struct spdk_mempool *mp) +{ + void *ele = NULL; + int rc; + + rc = rte_mempool_get((struct rte_mempool *)mp, &ele); + if (rc != 0) { + return NULL; + } + return ele; +} + +int +spdk_mempool_get_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + return rte_mempool_get_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +void +spdk_mempool_put(struct spdk_mempool *mp, void *ele) +{ + rte_mempool_put((struct rte_mempool *)mp, ele); +} + +void +spdk_mempool_put_bulk(struct spdk_mempool *mp, void **ele_arr, size_t count) +{ + rte_mempool_put_bulk((struct rte_mempool *)mp, ele_arr, count); +} + +size_t +spdk_mempool_count(const struct spdk_mempool *pool) +{ +#if RTE_VERSION < RTE_VERSION_NUM(16, 7, 0, 1) + return rte_mempool_count((struct rte_mempool *)pool); +#else + return rte_mempool_avail_count((struct rte_mempool *)pool); +#endif +} + +bool +spdk_process_is_primary(void) +{ + return (rte_eal_process_type() == RTE_PROC_PRIMARY); +} + +uint64_t spdk_get_ticks(void) +{ + return rte_get_timer_cycles(); +} + +uint64_t spdk_get_ticks_hz(void) +{ + return rte_get_timer_hz(); +} + +void spdk_delay_us(unsigned int us) +{ + rte_delay_us(us); +} + +void +spdk_unaffinitize_thread(void) +{ + rte_cpuset_t new_cpuset; + long num_cores, i; + 
+ CPU_ZERO(&new_cpuset); + + num_cores = sysconf(_SC_NPROCESSORS_CONF); + + /* Create a mask containing all CPUs */ + for (i = 0; i < num_cores; i++) { + CPU_SET(i, &new_cpuset); + } + + rte_thread_set_affinity(&new_cpuset); +} + +void * +spdk_call_unaffinitized(void *cb(void *arg), void *arg) +{ + rte_cpuset_t orig_cpuset; + void *ret; + + if (cb == NULL) { + return NULL; + } + + rte_thread_get_affinity(&orig_cpuset); + + spdk_unaffinitize_thread(); + + ret = cb(arg); + + rte_thread_set_affinity(&orig_cpuset); + + return ret; +} + +struct spdk_ring * +spdk_ring_create(enum spdk_ring_type type, size_t count, int socket_id) +{ + char ring_name[64]; + static uint32_t ring_num = 0; + unsigned flags = 0; + + switch (type) { + case SPDK_RING_TYPE_SP_SC: + flags = RING_F_SP_ENQ | RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_SC: + flags = RING_F_SC_DEQ; + break; + case SPDK_RING_TYPE_MP_MC: + flags = 0; + break; + default: + return NULL; + } + + snprintf(ring_name, sizeof(ring_name), "ring_%u_%d", + __sync_fetch_and_add(&ring_num, 1), getpid()); + + return (struct spdk_ring *)rte_ring_create(ring_name, count, socket_id, flags); +} + +void +spdk_ring_free(struct spdk_ring *ring) +{ + rte_ring_free((struct rte_ring *)ring); +} + +size_t +spdk_ring_count(struct spdk_ring *ring) +{ + return rte_ring_count((struct rte_ring *)ring); +} + +size_t +spdk_ring_enqueue(struct spdk_ring *ring, void **objs, size_t count) +{ + int rc; +#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) + rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count); + if (rc == 0) { + return count; + } + + return 0; +#else + rc = rte_ring_enqueue_bulk((struct rte_ring *)ring, objs, count, NULL); + return rc; +#endif +} + +size_t +spdk_ring_dequeue(struct spdk_ring *ring, void **objs, size_t count) +{ +#if RTE_VERSION < RTE_VERSION_NUM(17, 5, 0, 0) + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count); +#else + return rte_ring_dequeue_burst((struct rte_ring *)ring, objs, count, NULL); +#endif +} diff --git a/src/spdk/lib/env_dpdk/env.mk b/src/spdk/lib/env_dpdk/env.mk new file mode 100644 index 00000000..989bdd11 --- /dev/null +++ b/src/spdk/lib/env_dpdk/env.mk @@ -0,0 +1,112 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
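
The spdk_ring wrappers defined at the end of env.c above map directly onto DPDK's rte_ring. A minimal usage sketch follows; the function name and enqueued pointers are illustrative, and it assumes the environment has already been brought up with spdk_env_init():

#include "spdk/env.h"

static void
ring_demo(void)
{
	void *objs[4] = { (void *)0x1, (void *)0x2, (void *)0x3, (void *)0x4 };
	void *out[4];
	struct spdk_ring *ring;
	size_t n;

	/* count is passed straight to rte_ring_create(), so DPDK's sizing rules apply */
	ring = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 4096, SPDK_ENV_SOCKET_ID_ANY);
	if (ring == NULL) {
		return;
	}

	n = spdk_ring_enqueue(ring, objs, 4);	/* returns the number of objects enqueued */
	n = spdk_ring_dequeue(ring, out, n);	/* returns the number of objects dequeued */
	(void)n;

	spdk_ring_free(ring);
}
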
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This makefile snippet must define the following flags: +# ENV_CFLAGS +# ENV_CXXFLAGS +# ENV_LIBS +# ENV_LINKER_ARGS + +DPDK_DIR = $(CONFIG_DPDK_DIR) + +export DPDK_ABS_DIR = $(abspath $(DPDK_DIR)) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/include/rte_config.h)) +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include +else +DPDK_INC_DIR := $(DPDK_ABS_DIR)/include/dpdk +endif +DPDK_INC := -I$(DPDK_INC_DIR) + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_eal.a)) +DPDK_LIB_EXT = .a +else +DPDK_LIB_EXT = .so +endif + +DPDK_LIB_LIST = rte_eal rte_mempool rte_ring + +# librte_mempool_ring was new added from DPDK 17.05. Link this library used for +# ring based mempool management API. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_mempool_ring.*)) +DPDK_LIB_LIST += rte_mempool_ring +endif + +# librte_malloc was removed after DPDK 2.1. Link this library conditionally based on its +# existence to maintain backward compatibility. +ifneq ($(wildcard $(DPDK_ABS_DIR)/lib/librte_malloc.*),) +DPDK_LIB_LIST += rte_malloc +endif + +# librte_pci and librte_bus_pci were added in DPDK 17.11. Link these libraries conditionally +# based on their existence to maintain backward compatibility. +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_pci.*)) +DPDK_LIB_LIST += rte_pci +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_bus_pci.*)) +DPDK_LIB_LIST += rte_bus_pci +endif + +ifeq ($(CONFIG_CRYPTO),y) +DPDK_LIB_LIST += rte_cryptodev rte_reorder rte_bus_vdev rte_pmd_aesni_mb rte_pmd_qat rte_mbuf +endif + +ifneq (, $(wildcard $(DPDK_ABS_DIR)/lib/librte_kvargs.*)) +DPDK_LIB_LIST += rte_kvargs +endif + +DPDK_LIB = $(DPDK_LIB_LIST:%=$(DPDK_ABS_DIR)/lib/lib%$(DPDK_LIB_EXT)) +ifeq ($(CONFIG_CRYPTO),y) +DPDK_LIB += $(SPDK_ROOT_DIR)/intel-ipsec-mb/libIPSec_MB.a +endif + +# SPDK memory registration requires experimental (deprecated) rte_memory API for DPDK 18.05 +ENV_CFLAGS = $(DPDK_INC) -Wno-deprecated-declarations +ENV_CXXFLAGS = $(ENV_CFLAGS) +ENV_DPDK_FILE = $(call spdk_lib_list_to_static_libs,env_dpdk) +ENV_LIBS = $(ENV_DPDK_FILE) $(DPDK_LIB) +ENV_LINKER_ARGS = $(ENV_DPDK_FILE) -Wl,--whole-archive $(DPDK_LIB) -Wl,--no-whole-archive + +ifneq (,$(wildcard $(DPDK_INC_DIR)/rte_config.h)) +ifneq (,$(shell grep -e "define RTE_LIBRTE_VHOST_NUMA 1" -e "define RTE_EAL_NUMA_AWARE_HUGEPAGES 1" $(DPDK_INC_DIR)/rte_config.h)) +ENV_LINKER_ARGS += -lnuma +endif +endif + +ifeq ($(OS),Linux) +ENV_LINKER_ARGS += -ldl +endif +ifeq ($(OS),FreeBSD) +ENV_LINKER_ARGS += -lexecinfo +endif diff --git a/src/spdk/lib/env_dpdk/env_internal.h b/src/spdk/lib/env_dpdk/env_internal.h new file mode 100644 index 00000000..d95084ea --- /dev/null +++ b/src/spdk/lib/env_dpdk/env_internal.h @@ -0,0 +1,104 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ENV_INTERNAL_H +#define SPDK_ENV_INTERNAL_H + +#include "spdk/stdinc.h" + +#define spdk_pci_device rte_pci_device + +#include "spdk/env.h" + +#include +#include +#include +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 0) +#include +extern struct rte_pci_bus rte_pci_bus; +#endif +#include +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 1) +#include +#endif +#include + +/* x86-64 and ARM userspace virtual addresses use only the low 48 bits [0..47], + * which is enough to cover 256 TB. + */ +#define SHIFT_256TB 48 /* (1 << 48) == 256 TB */ +#define MASK_256TB ((1ULL << SHIFT_256TB) - 1) + +#define SHIFT_1GB 30 /* (1 << 30) == 1 GB */ +#define MASK_1GB ((1ULL << SHIFT_1GB) - 1) + +#define SHIFT_2MB 21 /* (1 << 21) == 2MB */ +#define MASK_2MB ((1ULL << SHIFT_2MB) - 1) +#define VALUE_2MB (1 << SHIFT_2MB) + +#define SHIFT_4KB 12 /* (1 << 12) == 4KB */ +#define MASK_4KB ((1ULL << SHIFT_4KB) - 1) + +struct spdk_pci_enum_ctx { + struct rte_pci_driver driver; + spdk_pci_enum_cb cb_fn; + void *cb_arg; + pthread_mutex_t mtx; + bool is_registered; +}; + +int spdk_pci_device_init(struct rte_pci_driver *driver, struct rte_pci_device *device); +int spdk_pci_device_fini(struct rte_pci_device *device); + +int spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx); +int spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, spdk_pci_enum_cb enum_cb, void *enum_ctx, + struct spdk_pci_addr *pci_address); + +int spdk_mem_map_init(void); +int spdk_vtophys_init(void); + +/** + * Report a DMA-capable PCI device to the vtophys translation code. + * Increases the refcount of active DMA-capable devices managed by SPDK. + * This must be called after a `rte_pci_device` is created. + */ +void spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device); + +/** + * Report the removal of a DMA-capable PCI device to the vtophys translation code. + * Decreases the refcount of active DMA-capable devices managed by SPDK. 
+ * This must be called before a `rte_pci_device` is destroyed. + */ +void spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device); + +#endif diff --git a/src/spdk/lib/env_dpdk/init.c b/src/spdk/lib/env_dpdk/init.c new file mode 100644 index 00000000..1a2fafe1 --- /dev/null +++ b/src/spdk/lib/env_dpdk/init.c @@ -0,0 +1,401 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include "spdk/version.h" + +#include +#include + +#define SPDK_ENV_DPDK_DEFAULT_NAME "spdk" +#define SPDK_ENV_DPDK_DEFAULT_SHM_ID -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_ENV_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_ENV_DPDK_DEFAULT_CORE_MASK "0x1" + +static char **eal_cmdline; +static int eal_cmdline_argcount; + +static char * +_sprintf_alloc(const char *format, ...) +{ + va_list args; + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + va_start(args, format); + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + va_end(args); + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + va_end(args); + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. + * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. 
+ */ + free(buf); + bufsize *= 2; + } + + va_end(args); + return NULL; +} + +static void +spdk_env_unlink_shared_files(void) +{ + /* Starting with DPDK 18.05, there are more files with unpredictable paths + * and filenames. The --no-shconf option prevents from creating them, but + * only for DPDK 18.08+. For DPDK 18.05 we just leave them be. + */ +#if RTE_VERSION < RTE_VERSION_NUM(18, 05, 0, 0) + char buffer[PATH_MAX]; + + snprintf(buffer, PATH_MAX, "/var/run/.spdk_pid%d_hugepage_info", getpid()); + if (unlink(buffer)) { + fprintf(stderr, "Unable to unlink shared memory file: %s. Error code: %d\n", buffer, errno); + } +#endif +} + +void +spdk_env_opts_init(struct spdk_env_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->name = SPDK_ENV_DPDK_DEFAULT_NAME; + opts->core_mask = SPDK_ENV_DPDK_DEFAULT_CORE_MASK; + opts->shm_id = SPDK_ENV_DPDK_DEFAULT_SHM_ID; + opts->mem_size = SPDK_ENV_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_ENV_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_ENV_DPDK_DEFAULT_MEM_CHANNEL; +} + +static void +spdk_free_args(char **args, int argcount) +{ + int i; + + for (i = 0; i < argcount; i++) { + free(args[i]); + } + + if (argcount) { + free(args); + } +} + +static char ** +spdk_push_arg(char *args[], int *argcount, char *arg) +{ + char **tmp; + + if (arg == NULL) { + fprintf(stderr, "%s: NULL arg supplied\n", __func__); + spdk_free_args(args, *argcount); + return NULL; + } + + tmp = realloc(args, sizeof(char *) * (*argcount + 1)); + if (tmp == NULL) { + spdk_free_args(args, *argcount); + return NULL; + } + + tmp[*argcount] = arg; + (*argcount)++; + + return tmp; +} + +static void +spdk_destruct_eal_cmdline(void) +{ + spdk_free_args(eal_cmdline, eal_cmdline_argcount); +} + + +static int +spdk_build_eal_cmdline(const struct spdk_env_opts *opts) +{ + int argcount = 0; + char **args; + + args = NULL; + + /* set the program name */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", opts->name)); + if (args == NULL) { + return -1; + } + + /* disable shared configuration files when in single process mode. 
This allows for cleaner shutdown */ + if (opts->shm_id < 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s", "--no-shconf")); + if (args == NULL) { + return -1; + } + } + + /* set the coremask */ + /* NOTE: If coremask starts with '[' and ends with ']' it is a core list + */ + if (opts->core_mask[0] == '[') { + char *l_arg = _sprintf_alloc("-l %s", opts->core_mask + 1); + int len = strlen(l_arg); + if (l_arg[len - 1] == ']') { + l_arg[len - 1] = '\0'; + } + args = spdk_push_arg(args, &argcount, l_arg); + } else { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-c %s", opts->core_mask)); + } + + if (args == NULL) { + return -1; + } + + /* set the memory channel number */ + if (opts->mem_channel > 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-n %d", opts->mem_channel)); + if (args == NULL) { + return -1; + } + } + + /* set the memory size */ + if (opts->mem_size >= 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("-m %d", opts->mem_size)); + if (args == NULL) { + return -1; + } + } + + /* set the master core */ + if (opts->master_core > 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--master-lcore=%d", + opts->master_core)); + if (args == NULL) { + return -1; + } + } + + /* set no pci if enabled */ + if (opts->no_pci) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--no-pci")); + if (args == NULL) { + return -1; + } + } + + /* create just one hugetlbfs file */ + if (opts->hugepage_single_segments) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--single-file-segments")); + if (args == NULL) { + return -1; + } + } + + /* unlink hugepages after initialization */ + if (opts->unlink_hugepage) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--huge-unlink")); + if (args == NULL) { + return -1; + } + } + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) && RTE_VERSION < RTE_VERSION_NUM(18, 5, 1, 0) + /* Dynamic memory management is buggy in DPDK 18.05.0. Don't use it. */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--legacy-mem")); + if (args == NULL) { + return -1; + } +#endif + + if (opts->num_pci_addr) { + size_t i; + char bdf[32]; + struct spdk_pci_addr *pci_addr = + opts->pci_blacklist ? opts->pci_blacklist : opts->pci_whitelist; + + for (i = 0; i < opts->num_pci_addr; i++) { + spdk_pci_addr_fmt(bdf, 32, &pci_addr[i]); + args = spdk_push_arg(args, &argcount, _sprintf_alloc("%s=%s", + (opts->pci_blacklist ? "--pci-blacklist" : "--pci-whitelist"), + bdf)); + if (args == NULL) { + return -1; + } + } + } + +#ifdef __linux__ + if (opts->shm_id < 0) { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk_pid%d", + getpid())); + if (args == NULL) { + return -1; + } + } else { + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--file-prefix=spdk%d", + opts->shm_id)); + if (args == NULL) { + return -1; + } + + /* Set the base virtual address - it must be an address that is not in the + * ASAN shadow region, otherwise ASAN-enabled builds will ignore the + * mmap hint. 
+ * + * Ref: https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm + */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--base-virtaddr=0x200000000000")); + if (args == NULL) { + return -1; + } + + /* set the process type */ + args = spdk_push_arg(args, &argcount, _sprintf_alloc("--proc-type=auto")); + if (args == NULL) { + return -1; + } + } +#endif + + eal_cmdline = args; + eal_cmdline_argcount = argcount; + if (atexit(spdk_destruct_eal_cmdline) != 0) { + fprintf(stderr, "Failed to register cleanup handler\n"); + } + + return argcount; +} + +int spdk_env_init(const struct spdk_env_opts *opts) +{ + char **dpdk_args = NULL; + int i, rc; + int orig_optind; + + rc = spdk_build_eal_cmdline(opts); + if (rc < 0) { + fprintf(stderr, "Invalid arguments to initialize DPDK\n"); + return -1; + } + + printf("Starting %s / %s initialization...\n", SPDK_VERSION_STRING, rte_version()); + printf("[ DPDK EAL parameters: "); + for (i = 0; i < eal_cmdline_argcount; i++) { + printf("%s ", eal_cmdline[i]); + } + printf("]\n"); + + /* DPDK rearranges the array we pass to it, so make a copy + * before passing so we can still free the individual strings + * correctly. + */ + dpdk_args = calloc(eal_cmdline_argcount, sizeof(char *)); + if (dpdk_args == NULL) { + fprintf(stderr, "Failed to allocate dpdk_args\n"); + return -1; + } + memcpy(dpdk_args, eal_cmdline, sizeof(char *) * eal_cmdline_argcount); + + fflush(stdout); + orig_optind = optind; + optind = 1; + rc = rte_eal_init(eal_cmdline_argcount, dpdk_args); + optind = orig_optind; + + free(dpdk_args); + + if (rc < 0) { + fprintf(stderr, "Failed to initialize DPDK\n"); + return -1; + } + + if (opts->shm_id < 0 && !opts->hugepage_single_segments) { + /* + * Unlink hugepage and config info files after init. This will ensure they get + * deleted on app exit, even if the app crashes and does not exit normally. + * Only do this when not in multi-process mode, since for multi-process other + * apps will need to open these files. These files are not created for + * "single file segments". + */ + spdk_env_unlink_shared_files(); + } + + if (spdk_mem_map_init() < 0) { + fprintf(stderr, "Failed to allocate mem_map\n"); + return -1; + } + if (spdk_vtophys_init() < 0) { + fprintf(stderr, "Failed to initialize vtophys\n"); + return -1; + } + + return 0; +} diff --git a/src/spdk/lib/env_dpdk/memory.c b/src/spdk/lib/env_dpdk/memory.c new file mode 100644 index 00000000..eaeccb90 --- /dev/null +++ b/src/spdk/lib/env_dpdk/memory.c @@ -0,0 +1,712 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
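
Putting the pieces of init.c above together, a minimal sketch of bringing up the environment; the application name and core mask are placeholders, and each field written here corresponds to an option consumed by spdk_build_eal_cmdline():

#include "spdk/env.h"

#include <stdio.h>

int
main(void)
{
	struct spdk_env_opts opts;

	spdk_env_opts_init(&opts);	/* fills in the SPDK_ENV_DPDK_DEFAULT_* values */
	opts.name = "env_demo";		/* becomes argv[0] of the generated EAL command line */
	opts.core_mask = "0x3";		/* emitted as "-c 0x3"; a "[...]" value becomes an "-l" core list */

	if (spdk_env_init(&opts) < 0) {
		fprintf(stderr, "Unable to initialize the SPDK environment\n");
		return 1;
	}

	return 0;
}
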
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include +#include + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +#define FN_2MB_TO_4KB(fn) (fn << (SHIFT_2MB - SHIFT_4KB)) +#define FN_4KB_TO_2MB(fn) (fn >> (SHIFT_2MB - SHIFT_4KB)) + +#define MAP_256TB_IDX(vfn_2mb) ((vfn_2mb) >> (SHIFT_1GB - SHIFT_2MB)) +#define MAP_1GB_IDX(vfn_2mb) ((vfn_2mb) & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1)) + +/* Page is registered */ +#define REG_MAP_REGISTERED (1ULL << 62) + +/* A notification region barrier. The 2MB translation entry that's marked + * with this flag must be unregistered separately. This allows contiguous + * regions to be unregistered in the same chunks they were registered. + */ +#define REG_MAP_NOTIFY_START (1ULL << 63) + +/* Translation of a single 2MB page. */ +struct map_2mb { + uint64_t translation_2mb; +}; + +/* Second-level map table indexed by bits [21..29] of the virtual address. + * Each entry contains the address translation or error for entries that haven't + * been retrieved yet. + */ +struct map_1gb { + struct map_2mb map[1ULL << (SHIFT_1GB - SHIFT_2MB)]; +}; + +/* Top-level map table indexed by bits [30..47] of the virtual address. + * Each entry points to a second-level map table or NULL. + */ +struct map_256tb { + struct map_1gb *map[1ULL << (SHIFT_256TB - SHIFT_1GB)]; +}; + +/* Page-granularity memory address translation */ +struct spdk_mem_map { + struct map_256tb map_256tb; + pthread_mutex_t mutex; + uint64_t default_translation; + struct spdk_mem_map_ops ops; + void *cb_ctx; + TAILQ_ENTRY(spdk_mem_map) tailq; +}; + +/* Registrations map. The 64 bit translations are bit fields with the + * following layout (starting with the low bits): + * 0 - 61 : reserved + * 62 - 63 : flags + */ +static struct spdk_mem_map *g_mem_reg_map; +static TAILQ_HEAD(, spdk_mem_map) g_spdk_mem_maps = TAILQ_HEAD_INITIALIZER(g_spdk_mem_maps); +static pthread_mutex_t g_spdk_mem_map_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* + * Walk the currently registered memory via the main memory registration map + * and call the new map's notify callback for each virtually contiguous region. + */ +static int +spdk_mem_map_notify_walk(struct spdk_mem_map *map, enum spdk_mem_map_notify_action action) +{ + size_t idx_256tb; + uint64_t idx_1gb; + uint64_t contig_start = UINT64_MAX; + uint64_t contig_end = UINT64_MAX; + struct map_1gb *map_1gb; + int rc; + + if (!g_mem_reg_map) { + return -EINVAL; + } + + /* Hold the memory registration map mutex so no new registrations can be added while we are looping. 
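
The map_2mb/map_1gb/map_256tb structures above implement a two-level, page-table-style lookup keyed by the 2 MB virtual frame number. A standalone sketch of the index arithmetic; the shift constants are copied from memory.c and the sample address is arbitrary:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFT_2MB	21
#define SHIFT_1GB	30

int
main(void)
{
	uint64_t vaddr = 0x00007f5a1c200000ULL;		/* example 2 MB-aligned user address */
	uint64_t vfn_2mb = vaddr >> SHIFT_2MB;		/* 2 MB virtual frame number */
	uint64_t idx_256tb = vfn_2mb >> (SHIFT_1GB - SHIFT_2MB);		/* MAP_256TB_IDX() */
	uint64_t idx_1gb = vfn_2mb & ((1ULL << (SHIFT_1GB - SHIFT_2MB)) - 1);	/* MAP_1GB_IDX() */

	printf("map_256tb.map[%" PRIu64 "]->map[%" PRIu64 "].translation_2mb\n",
	       idx_256tb, idx_1gb);
	return 0;
}
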
*/ + pthread_mutex_lock(&g_mem_reg_map->mutex); + + for (idx_256tb = 0; + idx_256tb < sizeof(g_mem_reg_map->map_256tb.map) / sizeof(g_mem_reg_map->map_256tb.map[0]); + idx_256tb++) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + } + contig_start = UINT64_MAX; + continue; + } + + for (idx_1gb = 0; idx_1gb < sizeof(map_1gb->map) / sizeof(map_1gb->map[0]); idx_1gb++) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_start == UINT64_MAX || + (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_start == UINT64_MAX) { + contig_start = vaddr; + } + + contig_end = vaddr; + } else { + if (contig_start != UINT64_MAX) { + /* End of of a virtually contiguous range */ + rc = map->ops.notify_cb(map->cb_ctx, map, action, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + /* Don't bother handling unregister failures. It can't be any worse */ + if (rc != 0 && action == SPDK_MEM_MAP_NOTIFY_REGISTER) { + goto err_unregister; + } + + /* This page might be a part of a neighbour region, so process + * it again. The idx_1gb will be incremented immediately. + */ + idx_1gb--; + } + contig_start = UINT64_MAX; + } + } + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return 0; + +err_unregister: + /* Unwind to the first empty translation so we don't unregister + * a region that just failed to register. 
+ */ + idx_256tb = MAP_256TB_IDX((contig_start >> SHIFT_2MB) - 1); + idx_1gb = MAP_1GB_IDX((contig_start >> SHIFT_2MB) - 1); + contig_start = UINT64_MAX; + contig_end = UINT64_MAX; + + /* Unregister any memory we managed to register before the failure */ + for (; idx_256tb < SIZE_MAX; idx_256tb--) { + map_1gb = g_mem_reg_map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + } + contig_end = UINT64_MAX; + continue; + } + + for (; idx_1gb < UINT64_MAX; idx_1gb--) { + if ((map_1gb->map[idx_1gb].translation_2mb & REG_MAP_REGISTERED) && + (contig_end == UINT64_MAX || (map_1gb->map[idx_1gb].translation_2mb & REG_MAP_NOTIFY_START) == 0)) { + /* Rebuild the virtual address from the indexes */ + uint64_t vaddr = (idx_256tb << SHIFT_1GB) | (idx_1gb << SHIFT_2MB); + + if (contig_end == UINT64_MAX) { + contig_end = vaddr; + } + contig_start = vaddr; + } else { + if (contig_end != UINT64_MAX) { + /* End of of a virtually contiguous range */ + map->ops.notify_cb(map->cb_ctx, map, + SPDK_MEM_MAP_NOTIFY_UNREGISTER, + (void *)contig_start, + contig_end - contig_start + VALUE_2MB); + idx_1gb++; + } + contig_end = UINT64_MAX; + } + } + idx_1gb = sizeof(map_1gb->map) / sizeof(map_1gb->map[0]) - 1; + } + + pthread_mutex_unlock(&g_mem_reg_map->mutex); + return rc; +} + +struct spdk_mem_map * +spdk_mem_map_alloc(uint64_t default_translation, const struct spdk_mem_map_ops *ops, void *cb_ctx) +{ + struct spdk_mem_map *map; + int rc; + + map = calloc(1, sizeof(*map)); + if (map == NULL) { + return NULL; + } + + if (pthread_mutex_init(&map->mutex, NULL)) { + free(map); + return NULL; + } + + map->default_translation = default_translation; + map->cb_ctx = cb_ctx; + if (ops) { + map->ops = *ops; + } + + if (ops && ops->notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + rc = spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_REGISTER); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + DEBUG_PRINT("Initial mem_map notify failed\n"); + pthread_mutex_destroy(&map->mutex); + free(map); + return NULL; + } + TAILQ_INSERT_TAIL(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + return map; +} + +void +spdk_mem_map_free(struct spdk_mem_map **pmap) +{ + struct spdk_mem_map *map; + size_t i; + + if (!pmap) { + return; + } + + map = *pmap; + + if (!map) { + return; + } + + if (map->ops.notify_cb) { + pthread_mutex_lock(&g_spdk_mem_map_mutex); + spdk_mem_map_notify_walk(map, SPDK_MEM_MAP_NOTIFY_UNREGISTER); + TAILQ_REMOVE(&g_spdk_mem_maps, map, tailq); + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + } + + for (i = 0; i < sizeof(map->map_256tb.map) / sizeof(map->map_256tb.map[0]); i++) { + free(map->map_256tb.map[i]); + } + + pthread_mutex_destroy(&map->mutex); + + free(map); + *pmap = NULL; +} + +int +spdk_mem_register(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + if (len == 0) { + return 0; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + seg_vaddr = vaddr; + seg_len = len; + while 
(seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if (reg & REG_MAP_REGISTERED) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EBUSY; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + seg_vaddr = vaddr; + seg_len = 0; + while (len > 0) { + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, + seg_len == 0 ? REG_MAP_REGISTERED | REG_MAP_NOTIFY_START : REG_MAP_REGISTERED); + seg_len += VALUE_2MB; + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_REGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +int +spdk_mem_unregister(void *vaddr, size_t len) +{ + struct spdk_mem_map *map; + int rc; + void *seg_vaddr; + size_t seg_len; + uint64_t reg, newreg; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_mem_map_mutex); + + /* The first page must be a start of a region. Also check if it's + * registered to make sure we don't return -ERANGE for non-registered + * regions. + */ + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) && (reg & REG_MAP_NOTIFY_START) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + + seg_vaddr = vaddr; + seg_len = len; + while (seg_len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + if ((reg & REG_MAP_REGISTERED) == 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -EINVAL; + } + seg_vaddr += VALUE_2MB; + seg_len -= VALUE_2MB; + } + + newreg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)seg_vaddr, NULL); + /* If the next page is registered, it must be a start of a region as well, + * otherwise we'd be unregistering only a part of a region. 
+ */ + if ((newreg & REG_MAP_NOTIFY_START) == 0 && (newreg & REG_MAP_REGISTERED)) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return -ERANGE; + } + seg_vaddr = vaddr; + seg_len = 0; + + while (len > 0) { + reg = spdk_mem_map_translate(g_mem_reg_map, (uint64_t)vaddr, NULL); + spdk_mem_map_set_translation(g_mem_reg_map, (uint64_t)vaddr, VALUE_2MB, 0); + + if (seg_len > 0 && (reg & REG_MAP_NOTIFY_START)) { + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + + seg_vaddr = vaddr; + seg_len = VALUE_2MB; + } else { + seg_len += VALUE_2MB; + } + + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + if (seg_len > 0) { + TAILQ_FOREACH(map, &g_spdk_mem_maps, tailq) { + rc = map->ops.notify_cb(map->cb_ctx, map, SPDK_MEM_MAP_NOTIFY_UNREGISTER, seg_vaddr, seg_len); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return rc; + } + } + } + + pthread_mutex_unlock(&g_spdk_mem_map_mutex); + return 0; +} + +static struct map_1gb * +spdk_mem_map_get_map_1gb(struct spdk_mem_map *map, uint64_t vfn_2mb) +{ + struct map_1gb *map_1gb; + uint64_t idx_256tb = MAP_256TB_IDX(vfn_2mb); + size_t i; + + if (spdk_unlikely(idx_256tb >= SPDK_COUNTOF(map->map_256tb.map))) { + return NULL; + } + + map_1gb = map->map_256tb.map[idx_256tb]; + + if (!map_1gb) { + pthread_mutex_lock(&map->mutex); + + /* Recheck to make sure nobody else got the mutex first. */ + map_1gb = map->map_256tb.map[idx_256tb]; + if (!map_1gb) { + map_1gb = malloc(sizeof(struct map_1gb)); + if (map_1gb) { + /* initialize all entries to default translation */ + for (i = 0; i < SPDK_COUNTOF(map_1gb->map); i++) { + map_1gb->map[i].translation_2mb = map->default_translation; + } + map->map_256tb.map[idx_256tb] = map_1gb; + } + } + + pthread_mutex_unlock(&map->mutex); + + if (!map_1gb) { + DEBUG_PRINT("allocation failed\n"); + return NULL; + } + } + + return map_1gb; +} + +int +spdk_mem_map_set_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size, + uint64_t translation) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + + while (size) { + map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +int +spdk_mem_map_clear_translation(struct spdk_mem_map *map, uint64_t vaddr, uint64_t size) +{ + uint64_t vfn_2mb; + struct map_1gb *map_1gb; + uint64_t idx_1gb; + struct map_2mb *map_2mb; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %lu\n", vaddr); + return -EINVAL; + } + + /* For now, only 2 MB-aligned registrations are supported */ + if (((uintptr_t)vaddr & MASK_2MB) || (size & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%lu len=%ju\n", + __func__, vaddr, size); + return -EINVAL; + } + + vfn_2mb = vaddr >> 
SHIFT_2MB; + + while (size) { + map_1gb = spdk_mem_map_get_map_1gb(map, vfn_2mb); + if (!map_1gb) { + DEBUG_PRINT("could not get %p map\n", (void *)vaddr); + return -ENOMEM; + } + + idx_1gb = MAP_1GB_IDX(vfn_2mb); + map_2mb = &map_1gb->map[idx_1gb]; + map_2mb->translation_2mb = map->default_translation; + + size -= VALUE_2MB; + vfn_2mb++; + } + + return 0; +} + +uint64_t +spdk_mem_map_translate(const struct spdk_mem_map *map, uint64_t vaddr, uint64_t *size) +{ + const struct map_1gb *map_1gb; + const struct map_2mb *map_2mb; + uint64_t idx_256tb; + uint64_t idx_1gb; + uint64_t vfn_2mb; + uint64_t total_size = 0; + uint64_t cur_size; + uint64_t prev_translation; + + if (size != NULL) { + total_size = *size; + *size = 0; + } + + if (spdk_unlikely(vaddr & ~MASK_256TB)) { + DEBUG_PRINT("invalid usermode virtual address %p\n", (void *)vaddr); + return map->default_translation; + } + + vfn_2mb = vaddr >> SHIFT_2MB; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + return map->default_translation; + } + + cur_size = VALUE_2MB; + if (size != NULL) { + *size = VALUE_2MB; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (size == NULL || map->ops.are_contiguous == NULL || + map_2mb->translation_2mb == map->default_translation) { + return map_2mb->translation_2mb; + } + + prev_translation = map_2mb->translation_2mb;; + while (cur_size < total_size) { + vfn_2mb++; + idx_256tb = MAP_256TB_IDX(vfn_2mb); + idx_1gb = MAP_1GB_IDX(vfn_2mb); + + map_1gb = map->map_256tb.map[idx_256tb]; + if (spdk_unlikely(!map_1gb)) { + break; + } + + map_2mb = &map_1gb->map[idx_1gb]; + if (!map->ops.are_contiguous(prev_translation, map_2mb->translation_2mb)) { + break; + } + + cur_size += VALUE_2MB; + prev_translation = map_2mb->translation_2mb; + } + + *size = cur_size; + return prev_translation; +} + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) +static void +memory_hotplug_cb(enum rte_mem_event event_type, + const void *addr, size_t len, void *arg) +{ + if (event_type == RTE_MEM_EVENT_ALLOC) { + while (len > 0) { + struct rte_memseg *seg; + + seg = rte_mem_virt2memseg(addr, NULL); + assert(seg != NULL); + assert(len >= seg->hugepage_sz); + + spdk_mem_register((void *)seg->addr, seg->hugepage_sz); + addr = (void *)((uintptr_t)addr + seg->hugepage_sz); + len -= seg->hugepage_sz; + } + } else if (event_type == RTE_MEM_EVENT_FREE) { + spdk_mem_unregister((void *)addr, len); + } +} + +static int +memory_iter_cb(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, size_t len, void *arg) +{ + return spdk_mem_register(ms->addr, len); +} +#endif + +int +spdk_mem_map_init(void) +{ + g_mem_reg_map = spdk_mem_map_alloc(0, NULL, NULL); + if (g_mem_reg_map == NULL) { + DEBUG_PRINT("memory registration map allocation failed\n"); + return -1; + } + + /* + * Walk all DPDK memory segments and register them + * with the master memory map + */ +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) + rte_mem_event_callback_register("spdk", memory_hotplug_cb, NULL); + rte_memseg_contig_walk(memory_iter_cb, NULL); +#else + struct rte_mem_config *mcfg; + size_t seg_idx; + + mcfg = rte_eal_get_configuration()->mem_config; + for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) { + struct rte_memseg *seg = &mcfg->memseg[seg_idx]; + + if (seg->addr == NULL) { + break; + } + + spdk_mem_register(seg->addr, seg->len); + } +#endif + return 0; +} diff --git a/src/spdk/lib/env_dpdk/pci.c b/src/spdk/lib/env_dpdk/pci.c new file mode 100644 index 
00000000..4153ac93 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci.c @@ -0,0 +1,551 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/env.h" + +#define SYSFS_PCI_DRIVERS "/sys/bus/pci/drivers" + +#define PCI_CFG_SIZE 256 +#define PCI_EXT_CAP_ID_SN 0x03 + +int +spdk_pci_device_init(struct rte_pci_driver *driver, + struct rte_pci_device *device) +{ + struct spdk_pci_enum_ctx *ctx = (struct spdk_pci_enum_ctx *)driver; + int rc; + + if (!ctx->cb_fn) { +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_unmap_device(device); +#elif RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + rte_eal_pci_unmap_device(device); +#endif + + /* Return a positive value to indicate that this device does not belong to this driver, but + * this isn't an error. 
*/ + return 1; + } + + rc = ctx->cb_fn(ctx->cb_arg, (struct spdk_pci_device *)device); + if (rc != 0) { + return rc; + } + + spdk_vtophys_pci_device_added(device); + return 0; +} + +int +spdk_pci_device_fini(struct rte_pci_device *device) +{ + spdk_vtophys_pci_device_removed(device); + return 0; +} + +void +spdk_pci_device_detach(struct spdk_pci_device *device) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) +#if RTE_VERSION < RTE_VERSION_NUM(17, 05, 0, 0) + rte_eal_device_remove(&device->device); +#endif +#endif + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + struct spdk_pci_addr addr; + char bdf[32]; + + addr.domain = device->addr.domain; + addr.bus = device->addr.bus; + addr.dev = device->addr.devid; + addr.func = device->addr.function; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rte_eal_dev_detach(&device->device) < 0) { + fprintf(stderr, "Failed to detach PCI device %s (device already removed?).\n", bdf); + } +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_detach(&device->addr); +#else + rte_eal_pci_detach(&device->addr); +#endif +} + +int +spdk_pci_device_attach(struct spdk_pci_enum_ctx *ctx, + spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + char bdf[32]; + + spdk_pci_addr_fmt(bdf, sizeof(bdf), pci_address); +#else + struct rte_pci_addr addr; + + addr.domain = pci_address->domain; + addr.bus = pci_address->bus; + addr.devid = pci_address->dev; + addr.function = pci_address->func; +#endif + + pthread_mutex_lock(&ctx->mtx); + + if (!ctx->is_registered) { + ctx->is_registered = true; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_register(&ctx->driver); +#else + rte_eal_pci_register(&ctx->driver); +#endif + } + + ctx->cb_fn = enum_cb; + ctx->cb_arg = enum_ctx; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + if (rte_eal_dev_attach(bdf, "") != 0) { +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + if (rte_pci_probe_one(&addr) != 0) { +#else + if (rte_eal_pci_probe_one(&addr) != 0) { +#endif + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + return -1; + } + + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + + return 0; +} + +/* Note: You can call spdk_pci_enumerate from more than one thread + * simultaneously safely, but you cannot call spdk_pci_enumerate + * and rte_eal_pci_probe simultaneously. 
+ */ +int +spdk_pci_enumerate(struct spdk_pci_enum_ctx *ctx, + spdk_pci_enum_cb enum_cb, + void *enum_ctx) +{ + pthread_mutex_lock(&ctx->mtx); + + if (!ctx->is_registered) { + ctx->is_registered = true; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rte_pci_register(&ctx->driver); +#else + rte_eal_pci_register(&ctx->driver); +#endif + } + + ctx->cb_fn = enum_cb; + ctx->cb_arg = enum_ctx; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + if (rte_bus_probe() != 0) { +#elif RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + if (rte_pci_probe() != 0) { +#else + if (rte_eal_pci_probe() != 0) { +#endif + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + return -1; + } + + ctx->cb_arg = NULL; + ctx->cb_fn = NULL; + pthread_mutex_unlock(&ctx->mtx); + + return 0; +} + +int +spdk_pci_device_map_bar(struct spdk_pci_device *device, uint32_t bar, + void **mapped_addr, uint64_t *phys_addr, uint64_t *size) +{ + struct rte_pci_device *dev = device; + + *mapped_addr = dev->mem_resource[bar].addr; + *phys_addr = (uint64_t)dev->mem_resource[bar].phys_addr; + *size = (uint64_t)dev->mem_resource[bar].len; + + return 0; +} + +int +spdk_pci_device_unmap_bar(struct spdk_pci_device *device, uint32_t bar, void *addr) +{ + return 0; +} + +uint32_t +spdk_pci_device_get_domain(struct spdk_pci_device *dev) +{ + return dev->addr.domain; +} + +uint8_t +spdk_pci_device_get_bus(struct spdk_pci_device *dev) +{ + return dev->addr.bus; +} + +uint8_t +spdk_pci_device_get_dev(struct spdk_pci_device *dev) +{ + return dev->addr.devid; +} + +uint8_t +spdk_pci_device_get_func(struct spdk_pci_device *dev) +{ + return dev->addr.function; +} + +uint16_t +spdk_pci_device_get_vendor_id(struct spdk_pci_device *dev) +{ + return dev->id.vendor_id; +} + +uint16_t +spdk_pci_device_get_device_id(struct spdk_pci_device *dev) +{ + return dev->id.device_id; +} + +uint16_t +spdk_pci_device_get_subvendor_id(struct spdk_pci_device *dev) +{ + return dev->id.subsystem_vendor_id; +} + +uint16_t +spdk_pci_device_get_subdevice_id(struct spdk_pci_device *dev) +{ + return dev->id.subsystem_device_id; +} + +struct spdk_pci_id +spdk_pci_device_get_id(struct spdk_pci_device *pci_dev) +{ + struct spdk_pci_id pci_id; + + pci_id.vendor_id = spdk_pci_device_get_vendor_id(pci_dev); + pci_id.device_id = spdk_pci_device_get_device_id(pci_dev); + pci_id.subvendor_id = spdk_pci_device_get_subvendor_id(pci_dev); + pci_id.subdevice_id = spdk_pci_device_get_subdevice_id(pci_dev); + + return pci_id; +} + +int +spdk_pci_device_get_socket_id(struct spdk_pci_device *pci_dev) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + return pci_dev->device.numa_node; +#else + return pci_dev->numa_node; +#endif +} + +int +spdk_pci_device_cfg_read(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rc = rte_pci_read_config(dev, value, len, offset); +#else + rc = rte_eal_pci_read_config(dev, value, len, offset); +#endif + return (rc > 0 && (uint32_t) rc == len) ? 0 : -1; +} + +int +spdk_pci_device_cfg_write(struct spdk_pci_device *dev, void *value, uint32_t len, uint32_t offset) +{ + int rc; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 05, 0, 4) + rc = rte_pci_write_config(dev, value, len, offset); +#else + rc = rte_eal_pci_write_config(dev, value, len, offset); +#endif + return (rc > 0 && (uint32_t) rc == len) ? 
0 : -1; +} + +int +spdk_pci_device_cfg_read8(struct spdk_pci_device *dev, uint8_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 1, offset); +} + +int +spdk_pci_device_cfg_write8(struct spdk_pci_device *dev, uint8_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 1, offset); +} + +int +spdk_pci_device_cfg_read16(struct spdk_pci_device *dev, uint16_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 2, offset); +} + +int +spdk_pci_device_cfg_write16(struct spdk_pci_device *dev, uint16_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 2, offset); +} + +int +spdk_pci_device_cfg_read32(struct spdk_pci_device *dev, uint32_t *value, uint32_t offset) +{ + return spdk_pci_device_cfg_read(dev, value, 4, offset); +} + +int +spdk_pci_device_cfg_write32(struct spdk_pci_device *dev, uint32_t value, uint32_t offset) +{ + return spdk_pci_device_cfg_write(dev, &value, 4, offset); +} + +int +spdk_pci_device_get_serial_number(struct spdk_pci_device *dev, char *sn, size_t len) +{ + int err; + uint32_t pos, header = 0; + uint32_t i, buf[2]; + + if (len < 17) { + return -1; + } + + err = spdk_pci_device_cfg_read32(dev, &header, PCI_CFG_SIZE); + if (err || !header) { + return -1; + } + + pos = PCI_CFG_SIZE; + while (1) { + if ((header & 0x0000ffff) == PCI_EXT_CAP_ID_SN) { + if (pos) { + /* skip the header */ + pos += 4; + for (i = 0; i < 2; i++) { + err = spdk_pci_device_cfg_read32(dev, &buf[i], pos + 4 * i); + if (err) { + return -1; + } + } + snprintf(sn, len, "%08x%08x", buf[1], buf[0]); + return 0; + } + } + pos = (header >> 20) & 0xffc; + /* 0 if no other items exist */ + if (pos < PCI_CFG_SIZE) { + return -1; + } + err = spdk_pci_device_cfg_read32(dev, &header, pos); + if (err) { + return -1; + } + } + return -1; +} + +struct spdk_pci_addr +spdk_pci_device_get_addr(struct spdk_pci_device *pci_dev) +{ + struct spdk_pci_addr pci_addr; + + pci_addr.domain = spdk_pci_device_get_domain(pci_dev); + pci_addr.bus = spdk_pci_device_get_bus(pci_dev); + pci_addr.dev = spdk_pci_device_get_dev(pci_dev); + pci_addr.func = spdk_pci_device_get_func(pci_dev); + + return pci_addr; +} + +int +spdk_pci_addr_compare(const struct spdk_pci_addr *a1, const struct spdk_pci_addr *a2) +{ + if (a1->domain > a2->domain) { + return 1; + } else if (a1->domain < a2->domain) { + return -1; + } else if (a1->bus > a2->bus) { + return 1; + } else if (a1->bus < a2->bus) { + return -1; + } else if (a1->dev > a2->dev) { + return 1; + } else if (a1->dev < a2->dev) { + return -1; + } else if (a1->func > a2->func) { + return 1; + } else if (a1->func < a2->func) { + return -1; + } + + return 0; +} + +#ifdef __linux__ +int +spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr) +{ + int dev_fd; + char dev_name[64]; + int pid; + void *dev_map; + struct flock pcidev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(dev_name, sizeof(dev_name), "/tmp/spdk_pci_lock_%04x:%02x:%02x.%x", pci_addr->domain, + pci_addr->bus, + pci_addr->dev, pci_addr->func); + + dev_fd = open(dev_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + fprintf(stderr, "could not open %s\n", dev_name); + return -1; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + fprintf(stderr, "could not truncate %s\n", dev_name); + close(dev_fd); + return -1; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + fprintf(stderr, "could not mmap dev 
%s (%d)\n", dev_name, errno); + close(dev_fd); + return -1; + } + + if (fcntl(dev_fd, F_SETLK, &pcidev_lock) != 0) { + pid = *(int *)dev_map; + fprintf(stderr, "Cannot create lock on device %s, probably" + " process %d has claimed it\n", dev_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + return -1; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + /* Keep dev_fd open to maintain the lock. */ + return dev_fd; +} +#endif /* __linux__ */ + +#ifdef __FreeBSD__ +int +spdk_pci_device_claim(const struct spdk_pci_addr *pci_addr) +{ + /* TODO */ + return 0; +} +#endif /* __FreeBSD__ */ + +int +spdk_pci_addr_parse(struct spdk_pci_addr *addr, const char *bdf) +{ + unsigned domain, bus, dev, func; + + if (addr == NULL || bdf == NULL) { + return -EINVAL; + } + + if ((sscanf(bdf, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4) || + (sscanf(bdf, "%x.%x.%x.%x", &domain, &bus, &dev, &func) == 4)) { + /* Matched a full address - all variables are initialized */ + } else if (sscanf(bdf, "%x:%x:%x", &domain, &bus, &dev) == 3) { + func = 0; + } else if ((sscanf(bdf, "%x:%x.%x", &bus, &dev, &func) == 3) || + (sscanf(bdf, "%x.%x.%x", &bus, &dev, &func) == 3)) { + domain = 0; + } else if ((sscanf(bdf, "%x:%x", &bus, &dev) == 2) || + (sscanf(bdf, "%x.%x", &bus, &dev) == 2)) { + domain = 0; + func = 0; + } else { + return -EINVAL; + } + + if (bus > 0xFF || dev > 0x1F || func > 7) { + return -EINVAL; + } + + addr->domain = domain; + addr->bus = bus; + addr->dev = dev; + addr->func = func; + + return 0; +} + +int +spdk_pci_addr_fmt(char *bdf, size_t sz, const struct spdk_pci_addr *addr) +{ + int rc; + + rc = snprintf(bdf, sz, "%04x:%02x:%02x.%x", + addr->domain, addr->bus, + addr->dev, addr->func); + + if (rc > 0 && (size_t)rc < sz) { + return 0; + } + + return -1; +} diff --git a/src/spdk/lib/env_dpdk/pci_ioat.c b/src/spdk/lib/env_dpdk/pci_ioat.c new file mode 100644 index 00000000..b9640283 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_ioat.c @@ -0,0 +1,123 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +#define SPDK_IOAT_PCI_DEVICE(DEVICE_ID) RTE_PCI_DEVICE(SPDK_PCI_VID_INTEL, DEVICE_ID) +static struct rte_pci_id ioat_driver_id[] = { + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SNB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_IVB9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_HSW9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BWD3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDXDE3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX0)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX1)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX2)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX3)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX4)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX5)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX6)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX7)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX8)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_BDX9)}, + {SPDK_IOAT_PCI_DEVICE(PCI_DEVICE_ID_INTEL_IOAT_SKX)}, + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct 
spdk_pci_enum_ctx g_ioat_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING, + .id_table = ioat_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_ioat", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_ioat", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_ioat_device_attach(spdk_pci_enum_cb enum_cb, void *enum_ctx, + struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_ioat_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_ioat_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_ioat_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/pci_nvme.c b/src/spdk/lib/env_dpdk/pci_nvme.c new file mode 100644 index 00000000..4f3b84d1 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_nvme.c @@ -0,0 +1,89 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct rte_pci_id nvme_pci_driver_id[] = { +#if RTE_VERSION >= RTE_VERSION_NUM(16, 7, 0, 1) + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = PCI_ANY_ID, + .device_id = PCI_ANY_ID, + .subsystem_vendor_id = PCI_ANY_ID, + .subsystem_device_id = PCI_ANY_ID, + }, +#else + {RTE_PCI_DEVICE(0x8086, 0x0953)}, +#endif + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct spdk_pci_enum_ctx g_nvme_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING +#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) + | RTE_PCI_DRV_WC_ACTIVATE +#endif + , + .id_table = nvme_pci_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_nvme", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_nvme", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_nvme_device_attach(spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_nvme_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_nvme_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_nvme_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/pci_virtio.c b/src/spdk/lib/env_dpdk/pci_virtio.c new file mode 100644 index 00000000..1fcb80d7 --- /dev/null +++ b/src/spdk/lib/env_dpdk/pci_virtio.c @@ -0,0 +1,80 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "env_internal.h" + +#include "spdk/pci_ids.h" + +static struct rte_pci_id virtio_pci_driver_id[] = { + { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_SCSI_MODERN) }, + { RTE_PCI_DEVICE(SPDK_PCI_VID_VIRTIO, PCI_DEVICE_ID_VIRTIO_BLK_MODERN) }, + { .vendor_id = 0, /* sentinel */ }, +}; + +static struct spdk_pci_enum_ctx g_virtio_pci_drv = { + .driver = { + .drv_flags = RTE_PCI_DRV_NEED_MAPPING +#if RTE_VERSION >= RTE_VERSION_NUM(18, 8, 0, 0) + | RTE_PCI_DRV_WC_ACTIVATE +#endif + , + .id_table = virtio_pci_driver_id, +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 0) + .probe = spdk_pci_device_init, + .remove = spdk_pci_device_fini, + .driver.name = "spdk_virtio", +#else + .devinit = spdk_pci_device_init, + .devuninit = spdk_pci_device_fini, + .name = "spdk_virtio", +#endif + }, + + .cb_fn = NULL, + .cb_arg = NULL, + .mtx = PTHREAD_MUTEX_INITIALIZER, + .is_registered = false, +}; + +int +spdk_pci_virtio_device_attach(spdk_pci_enum_cb enum_cb, + void *enum_ctx, struct spdk_pci_addr *pci_address) +{ + return spdk_pci_device_attach(&g_virtio_pci_drv, enum_cb, enum_ctx, pci_address); +} + +int +spdk_pci_virtio_enumerate(spdk_pci_enum_cb enum_cb, void *enum_ctx) +{ + return spdk_pci_enumerate(&g_virtio_pci_drv, enum_cb, enum_ctx); +} diff --git a/src/spdk/lib/env_dpdk/threads.c b/src/spdk/lib/env_dpdk/threads.c new file mode 100644 index 00000000..55b0bbb6 --- /dev/null +++ b/src/spdk/lib/env_dpdk/threads.c @@ -0,0 +1,108 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/env.h" + +#include +#include + +uint32_t +spdk_env_get_core_count(void) +{ + return rte_lcore_count(); +} + +uint32_t +spdk_env_get_current_core(void) +{ + return rte_lcore_id(); +} + +uint32_t +spdk_env_get_first_core(void) +{ + return rte_get_next_lcore(-1, 0, 0); +} + +uint32_t +spdk_env_get_last_core(void) +{ + uint32_t i; + uint32_t last_core = UINT32_MAX; + + SPDK_ENV_FOREACH_CORE(i) { + last_core = i; + } + + assert(last_core != UINT32_MAX); + + return last_core; +} + +uint32_t +spdk_env_get_next_core(uint32_t prev_core) +{ + unsigned lcore; + + lcore = rte_get_next_lcore(prev_core, 0, 0); + if (lcore == RTE_MAX_LCORE) { + return UINT32_MAX; + } + return lcore; +} + +uint32_t +spdk_env_get_socket_id(uint32_t core) +{ + if (core >= RTE_MAX_LCORE) { + return SPDK_ENV_SOCKET_ID_ANY; + } + + return rte_lcore_to_socket_id(core); +} + +int +spdk_env_thread_launch_pinned(uint32_t core, thread_start_fn fn, void *arg) +{ + int rc; + + rc = rte_eal_remote_launch(fn, arg, core); + + return rc; +} + +void +spdk_env_thread_wait_all(void) +{ + rte_eal_mp_wait_lcore(); +} diff --git a/src/spdk/lib/env_dpdk/vtophys.c b/src/spdk/lib/env_dpdk/vtophys.c new file mode 100644 index 00000000..00e8bb6d --- /dev/null +++ b/src/spdk/lib/env_dpdk/vtophys.c @@ -0,0 +1,691 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "env_internal.h" + +#include +#include + +#include "spdk_internal/assert.h" + +#include "spdk/assert.h" +#include "spdk/likely.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +#ifdef __FreeBSD__ +#define SPDK_VFIO_ENABLED 0 +#else +#include +/* + * DPDK versions before 17.11 don't provide a way to get VFIO information in the public API, + * and we can't link to internal symbols when built against shared library DPDK, + * so disable VFIO entirely in that case. 
+ */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) && \ + (RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) || !defined(RTE_BUILD_SHARED_LIB)) + +#define SPDK_VFIO_ENABLED 1 +#include + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) +#include +#else +/* Internal DPDK function forward declaration */ +int pci_vfio_is_enabled(void); +#endif + +struct spdk_vfio_dma_map { + struct vfio_iommu_type1_dma_map map; + struct vfio_iommu_type1_dma_unmap unmap; + TAILQ_ENTRY(spdk_vfio_dma_map) tailq; +}; + +struct vfio_cfg { + int fd; + bool enabled; + unsigned device_ref; + TAILQ_HEAD(, spdk_vfio_dma_map) maps; + pthread_mutex_t mutex; +}; + +static struct vfio_cfg g_vfio = { + .fd = -1, + .enabled = false, + .device_ref = 0, + .maps = TAILQ_HEAD_INITIALIZER(g_vfio.maps), + .mutex = PTHREAD_MUTEX_INITIALIZER +}; + +#else +#define SPDK_VFIO_ENABLED 0 +#endif +#endif + +#if DEBUG +#define DEBUG_PRINT(...) fprintf(stderr, __VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +struct spdk_vtophys_pci_device { + struct rte_pci_device *pci_device; + TAILQ_ENTRY(spdk_vtophys_pci_device) tailq; + uint64_t ref; +}; + +static pthread_mutex_t g_vtophys_pci_devices_mutex = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, spdk_vtophys_pci_device) g_vtophys_pci_devices = + TAILQ_HEAD_INITIALIZER(g_vtophys_pci_devices); + +static struct spdk_mem_map *g_vtophys_map; + +#if SPDK_VFIO_ENABLED +static int +vtophys_iommu_map_dma(uint64_t vaddr, uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + int ret; + + dma_map = calloc(1, sizeof(*dma_map)); + if (dma_map == NULL) { + return -ENOMEM; + } + + dma_map->map.argsz = sizeof(dma_map->map); + dma_map->map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + dma_map->map.vaddr = vaddr; + dma_map->map.iova = iova; + dma_map->map.size = size; + + dma_map->unmap.argsz = sizeof(dma_map->unmap); + dma_map->unmap.flags = 0; + dma_map->unmap.iova = iova; + dma_map->unmap.size = size; + + pthread_mutex_lock(&g_vfio.mutex); + if (g_vfio.device_ref == 0) { + /* VFIO requires at least one device (IOMMU group) to be added to + * a VFIO container before it is possible to perform any IOMMU + * operations on that container. This memory will be mapped once + * the first device (IOMMU group) is hotplugged. + * + * Since the vfio container is managed internally by DPDK, it is + * also possible that some device is already in that container, but + * it's not managed by SPDK - e.g. an NIC attached internally + * inside DPDK. We could map the memory straight away in such + * scenario, but there's no need to do it. DPDK devices clearly + * don't need our mappings and hence we defer the mapping + * unconditionally until the first SPDK-managed device is + * hotplugged. 
+ */ + goto out_insert; + } + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot set up DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return ret; + } + +out_insert: + TAILQ_INSERT_TAIL(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + return 0; +} + +static int +vtophys_iommu_unmap_dma(uint64_t iova, uint64_t size) +{ + struct spdk_vfio_dma_map *dma_map; + int ret; + + pthread_mutex_lock(&g_vfio.mutex); + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + if (dma_map->map.iova == iova) { + break; + } + } + + if (dma_map == NULL) { + DEBUG_PRINT("Cannot clear DMA mapping for IOVA %"PRIx64" - it's not mapped\n", iova); + pthread_mutex_unlock(&g_vfio.mutex); + return -ENXIO; + } + + /** don't support partial or multiple-page unmap for now */ + assert(dma_map->map.size == size); + + if (g_vfio.device_ref == 0) { + /* Memory is not mapped anymore, just remove it's references */ + goto out_remove; + } + + + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot clear DMA mapping, error %d\n", errno); + pthread_mutex_unlock(&g_vfio.mutex); + return ret; + } + +out_remove: + TAILQ_REMOVE(&g_vfio.maps, dma_map, tailq); + pthread_mutex_unlock(&g_vfio.mutex); + free(dma_map); + return 0; +} +#endif + +static uint64_t +vtophys_get_paddr_memseg(uint64_t vaddr) +{ + uintptr_t paddr; + struct rte_memseg *seg; + +#if RTE_VERSION >= RTE_VERSION_NUM(18, 05, 0, 0) + seg = rte_mem_virt2memseg((void *)(uintptr_t)vaddr, NULL); + if (seg != NULL) { + paddr = seg->phys_addr; + if (paddr == RTE_BAD_IOVA) { + return SPDK_VTOPHYS_ERROR; + } + paddr += (vaddr - (uintptr_t)seg->addr); + return paddr; + } +#else + struct rte_mem_config *mcfg; + uint32_t seg_idx; + + mcfg = rte_eal_get_configuration()->mem_config; + for (seg_idx = 0; seg_idx < RTE_MAX_MEMSEG; seg_idx++) { + seg = &mcfg->memseg[seg_idx]; + if (seg->addr == NULL) { + break; + } + + if (vaddr >= (uintptr_t)seg->addr && + vaddr < ((uintptr_t)seg->addr + seg->len)) { + paddr = seg->phys_addr; +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + if (paddr == RTE_BAD_IOVA) { +#else + if (paddr == RTE_BAD_PHYS_ADDR) { +#endif + return SPDK_VTOPHYS_ERROR; + } + paddr += (vaddr - (uintptr_t)seg->addr); + return paddr; + } + } +#endif + + return SPDK_VTOPHYS_ERROR; +} + +/* Try to get the paddr from /proc/self/pagemap */ +static uint64_t +vtophys_get_paddr_pagemap(uint64_t vaddr) +{ + uintptr_t paddr; + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) +#define BAD_ADDR RTE_BAD_IOVA +#define VTOPHYS rte_mem_virt2iova +#else +#define BAD_ADDR RTE_BAD_PHYS_ADDR +#define VTOPHYS rte_mem_virt2phy +#endif + + /* + * Note: the virt2phy/virt2iova functions have changed over time, such + * that older versions may return 0 while recent versions will never + * return 0 but RTE_BAD_PHYS_ADDR/IOVA instead. To support older and + * newer versions, check for both return values. + */ + paddr = VTOPHYS((void *)vaddr); + if (paddr == 0 || paddr == BAD_ADDR) { + /* + * The vaddr may be valid but doesn't have a backing page + * assigned yet. Touch the page to ensure a backing page + * gets assigned, then try to translate again. + */ + rte_atomic64_read((rte_atomic64_t *)vaddr); + paddr = VTOPHYS((void *)vaddr); + } + if (paddr == 0 || paddr == BAD_ADDR) { + /* Unable to get to the physical address. 
*/ + return SPDK_VTOPHYS_ERROR; + } + +#undef BAD_ADDR +#undef VTOPHYS + + return paddr; +} + +/* Try to get the paddr from pci devices */ +static uint64_t +vtophys_get_paddr_pci(uint64_t vaddr) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + uintptr_t paddr; + struct rte_pci_device *dev; +#if RTE_VERSION >= RTE_VERSION_NUM(16, 11, 0, 1) + struct rte_mem_resource *res; +#else + struct rte_pci_resource *res; +#endif + unsigned r; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + dev = vtophys_dev->pci_device; + + for (r = 0; r < PCI_MAX_RESOURCE; r++) { + res = &dev->mem_resource[r]; + if (res->phys_addr && vaddr >= (uint64_t)res->addr && + vaddr < (uint64_t)res->addr + res->len) { + paddr = res->phys_addr + (vaddr - (uint64_t)res->addr); + DEBUG_PRINT("%s: %p -> %p\n", __func__, (void *)vaddr, + (void *)paddr); + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + return paddr; + } + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + + return SPDK_VTOPHYS_ERROR; +} + +static int +spdk_vtophys_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t len) +{ + int rc = 0, pci_phys = 0; + uint64_t paddr; + + if ((uintptr_t)vaddr & ~MASK_256TB) { + DEBUG_PRINT("invalid usermode virtual address %p\n", vaddr); + return -EINVAL; + } + + if (((uintptr_t)vaddr & MASK_2MB) || (len & MASK_2MB)) { + DEBUG_PRINT("invalid %s parameters, vaddr=%p len=%ju\n", + __func__, vaddr, len); + return -EINVAL; + } + + while (len > 0) { + /* Get the physical address from the DPDK memsegs */ + paddr = vtophys_get_paddr_memseg((uint64_t)vaddr); + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (paddr == SPDK_VTOPHYS_ERROR) { + /* This is not an address that DPDK is managing. */ +#if SPDK_VFIO_ENABLED + if (g_vfio.enabled) { + /* We'll use the virtual address as the iova. DPDK + * currently uses physical addresses as the iovas (or counts + * up from 0 if it can't get physical addresses), so + * the range of user space virtual addresses and physical + * addresses will never overlap. + */ + paddr = (uint64_t)vaddr; + rc = vtophys_iommu_map_dma((uint64_t)vaddr, paddr, VALUE_2MB); + if (rc) { + return -EFAULT; + } + } else +#endif + { + /* Get the physical address from /proc/self/pagemap. */ + paddr = vtophys_get_paddr_pagemap((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + /* Get the physical address from PCI devices */ + paddr = vtophys_get_paddr_pci((uint64_t)vaddr); + if (paddr == SPDK_VTOPHYS_ERROR) { + DEBUG_PRINT("could not get phys addr for %p\n", vaddr); + return -EFAULT; + } + pci_phys = 1; + } + } + } + /* Since PCI paddr can break the 2MiB physical alignment skip this check for that. */ + if (!pci_phys && (paddr & MASK_2MB)) { + DEBUG_PRINT("invalid paddr 0x%" PRIx64 " - must be 2MB aligned\n", paddr); + return -EINVAL; + } + + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, VALUE_2MB, paddr); + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: +#if SPDK_VFIO_ENABLED + if (paddr == SPDK_VTOPHYS_ERROR) { + /* + * This is not an address that DPDK is managing. 
If vfio is enabled, + * we need to unmap the range from the IOMMU + */ + if (g_vfio.enabled) { + uint64_t buffer_len; + paddr = spdk_mem_map_translate(map, (uint64_t)vaddr, &buffer_len); + if (buffer_len != VALUE_2MB) { + return -EINVAL; + } + rc = vtophys_iommu_unmap_dma(paddr, VALUE_2MB); + if (rc) { + return -EFAULT; + } + } + } +#endif + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, VALUE_2MB); + break; + default: + SPDK_UNREACHABLE(); + } + + if (rc != 0) { + return rc; + } + vaddr += VALUE_2MB; + len -= VALUE_2MB; + } + + return rc; +} + +#if SPDK_VFIO_ENABLED + +static bool +spdk_vfio_enabled(void) +{ +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + return rte_vfio_is_enabled("vfio_pci"); +#else + return pci_vfio_is_enabled(); +#endif +} + +static void +spdk_vtophys_iommu_init(void) +{ + char proc_fd_path[PATH_MAX + 1]; + char link_path[PATH_MAX + 1]; + const char vfio_path[] = "/dev/vfio/vfio"; + DIR *dir; + struct dirent *d; + + if (!spdk_vfio_enabled()) { + return; + } + + dir = opendir("/proc/self/fd"); + if (!dir) { + DEBUG_PRINT("Failed to open /proc/self/fd (%d)\n", errno); + return; + } + + while ((d = readdir(dir)) != NULL) { + if (d->d_type != DT_LNK) { + continue; + } + + snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%s", d->d_name); + if (readlink(proc_fd_path, link_path, sizeof(link_path)) != (sizeof(vfio_path) - 1)) { + continue; + } + + if (memcmp(link_path, vfio_path, sizeof(vfio_path) - 1) == 0) { + sscanf(d->d_name, "%d", &g_vfio.fd); + break; + } + } + + closedir(dir); + + if (g_vfio.fd < 0) { + DEBUG_PRINT("Failed to discover DPDK VFIO container fd.\n"); + return; + } + + g_vfio.enabled = true; + + return; +} +#endif + +void +spdk_vtophys_pci_device_added(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + bool found = false; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + if (vtophys_dev->pci_device == pci_device) { + vtophys_dev->ref++; + found = true; + break; + } + } + + if (!found) { + vtophys_dev = calloc(1, sizeof(*vtophys_dev)); + if (vtophys_dev) { + vtophys_dev->pci_device = pci_device; + vtophys_dev->ref = 1; + TAILQ_INSERT_TAIL(&g_vtophys_pci_devices, vtophys_dev, tailq); + } else { + DEBUG_PRINT("Memory allocation error\n"); + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if SPDK_VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + g_vfio.device_ref++; + if (g_vfio.device_ref > 1) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the first SPDK device using DPDK vfio. This means that the first + * IOMMU group might have been just been added to the DPDK vfio container. + * From this point it is certain that the memory can be mapped now. 
+ */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_MAP_DMA, &dma_map->map); + if (ret) { + DEBUG_PRINT("Cannot update DMA mapping, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +void +spdk_vtophys_pci_device_removed(struct rte_pci_device *pci_device) +{ + struct spdk_vtophys_pci_device *vtophys_dev; + + pthread_mutex_lock(&g_vtophys_pci_devices_mutex); + TAILQ_FOREACH(vtophys_dev, &g_vtophys_pci_devices, tailq) { + if (vtophys_dev->pci_device == pci_device) { + assert(vtophys_dev->ref > 0); + if (--vtophys_dev->ref == 0) { + TAILQ_REMOVE(&g_vtophys_pci_devices, vtophys_dev, tailq); + free(vtophys_dev); + } + break; + } + } + pthread_mutex_unlock(&g_vtophys_pci_devices_mutex); + +#if SPDK_VFIO_ENABLED + struct spdk_vfio_dma_map *dma_map; + int ret; + + if (!g_vfio.enabled) { + return; + } + + pthread_mutex_lock(&g_vfio.mutex); + assert(g_vfio.device_ref > 0); + g_vfio.device_ref--; + if (g_vfio.device_ref > 0) { + pthread_mutex_unlock(&g_vfio.mutex); + return; + } + + /* This is the last SPDK device using DPDK vfio. If DPDK doesn't have + * any additional devices using it's vfio container, all the mappings + * will be automatically removed by the Linux vfio driver. We unmap + * the memory manually to be able to easily re-map it later regardless + * of other, external factors. + */ + TAILQ_FOREACH(dma_map, &g_vfio.maps, tailq) { + ret = ioctl(g_vfio.fd, VFIO_IOMMU_UNMAP_DMA, &dma_map->unmap); + if (ret) { + DEBUG_PRINT("Cannot unmap DMA memory, error %d\n", errno); + break; + } + } + pthread_mutex_unlock(&g_vfio.mutex); +#endif +} + +int +spdk_vtophys_init(void) +{ + const struct spdk_mem_map_ops vtophys_map_ops = { + .notify_cb = spdk_vtophys_notify, + .are_contiguous = NULL + }; + +#if SPDK_VFIO_ENABLED + spdk_vtophys_iommu_init(); +#endif + + g_vtophys_map = spdk_mem_map_alloc(SPDK_VTOPHYS_ERROR, &vtophys_map_ops, NULL); + if (g_vtophys_map == NULL) { + DEBUG_PRINT("vtophys map allocation failed\n"); + return -1; + } + return 0; +} + +uint64_t +spdk_vtophys(void *buf) +{ + uint64_t vaddr, paddr_2mb; + + vaddr = (uint64_t)buf; + + paddr_2mb = spdk_mem_map_translate(g_vtophys_map, vaddr, NULL); + + /* + * SPDK_VTOPHYS_ERROR has all bits set, so if the lookup returned SPDK_VTOPHYS_ERROR, + * we will still bitwise-or it with the buf offset below, but the result will still be + * SPDK_VTOPHYS_ERROR. However now that we do + rather than | (due to PCI vtophys being + * unaligned) we must now check the return value before addition. + */ + SPDK_STATIC_ASSERT(SPDK_VTOPHYS_ERROR == UINT64_C(-1), "SPDK_VTOPHYS_ERROR should be all 1s"); + if (paddr_2mb == SPDK_VTOPHYS_ERROR) { + return SPDK_VTOPHYS_ERROR; + } else { + return paddr_2mb + ((uint64_t)buf & MASK_2MB); + } +} + +static int +spdk_bus_scan(void) +{ + return 0; +} + +static int +spdk_bus_probe(void) +{ + return 0; +} + +static struct rte_device * +spdk_bus_find_device(const struct rte_device *start, + rte_dev_cmp_t cmp, const void *data) +{ + return NULL; +} + +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) +static enum rte_iova_mode +spdk_bus_get_iommu_class(void) { + /* Since we register our PCI drivers after EAL init, we have no chance + * of switching into RTE_IOVA_VA (virtual addresses as iova) iommu + * class. DPDK uses RTE_IOVA_PA by default because for some platforms + * it's the only supported mode, but then SPDK does not support those + * platforms and doesn't mind defaulting to RTE_IOVA_VA. 
The rte_pci bus + * will force RTE_IOVA_PA if RTE_IOVA_VA simply can not be used + * (i.e. at least one device on the system is bound to uio_pci_generic), + * so we simply return RTE_IOVA_VA here. + */ + return RTE_IOVA_VA; +} +#endif + +struct rte_bus spdk_bus = { + .scan = spdk_bus_scan, + .probe = spdk_bus_probe, + .find_device = spdk_bus_find_device, +#if RTE_VERSION >= RTE_VERSION_NUM(17, 11, 0, 3) + .get_iommu_class = spdk_bus_get_iommu_class, +#endif +}; + +RTE_REGISTER_BUS(spdk, spdk_bus); diff --git a/src/spdk/lib/event/Makefile b/src/spdk/lib/event/Makefile new file mode 100644 index 00000000..659b85e9 --- /dev/null +++ b/src/spdk/lib/event/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = event +C_SRCS = app.c reactor.c rpc.c subsystem.c + +DIRS-y = rpc subsystems + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/app.c b/src/spdk/lib/event/app.c new file mode 100644 index 00000000..012e2920 --- /dev/null +++ b/src/spdk/lib/event/app.c @@ -0,0 +1,998 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/event.h" + +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/string.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#define SPDK_APP_DEFAULT_LOG_LEVEL SPDK_LOG_NOTICE +#define SPDK_APP_DEFAULT_LOG_PRINT_LEVEL SPDK_LOG_INFO +#define SPDK_APP_DEFAULT_BACKTRACE_LOG_LEVEL SPDK_LOG_ERROR + +#define SPDK_APP_DPDK_DEFAULT_MEM_SIZE -1 +#define SPDK_APP_DPDK_DEFAULT_MASTER_CORE -1 +#define SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL -1 +#define SPDK_APP_DPDK_DEFAULT_CORE_MASK "0x1" + +struct spdk_app { + struct spdk_conf *config; + int shm_id; + spdk_app_shutdown_cb shutdown_cb; + int rc; +}; + +static struct spdk_app g_spdk_app; +static struct spdk_event *g_app_start_event = NULL; +static struct spdk_event *g_shutdown_event = NULL; +static int g_init_lcore; +static bool g_delay_subsystem_init = false; +static bool g_shutdown_sig_received = false; +static char *g_executable_name; +static struct spdk_app_opts g_default_opts; + +int +spdk_app_get_shm_id(void) +{ + return g_spdk_app.shm_id; +} + +/* append one empty option to indicate the end of the array */ +static const struct option g_cmdline_options[] = { +#define CONFIG_FILE_OPT_IDX 'c' + {"config", required_argument, NULL, CONFIG_FILE_OPT_IDX}, +#define LIMIT_COREDUMP_OPT_IDX 'd' + {"limit-coredump", no_argument, NULL, LIMIT_COREDUMP_OPT_IDX}, +#define TPOINT_GROUP_MASK_OPT_IDX 'e' + {"tpoint-group-mask", required_argument, NULL, TPOINT_GROUP_MASK_OPT_IDX}, +#define SINGLE_FILE_SEGMENTS_OPT_IDX 'g' + {"single-file-segments", no_argument, NULL, SINGLE_FILE_SEGMENTS_OPT_IDX}, +#define HELP_OPT_IDX 'h' + {"help", no_argument, NULL, HELP_OPT_IDX}, +#define SHM_ID_OPT_IDX 'i' + {"shm-id", required_argument, NULL, SHM_ID_OPT_IDX}, +#define CPUMASK_OPT_IDX 'm' + {"cpumask", required_argument, NULL, CPUMASK_OPT_IDX}, +#define MEM_CHANNELS_OPT_IDX 'n' + {"mem-channels", required_argument, NULL, MEM_CHANNELS_OPT_IDX}, +#define MASTER_CORE_OPT_IDX 'p' + {"master-core", required_argument, NULL, MASTER_CORE_OPT_IDX}, +#define RPC_SOCKET_OPT_IDX 'r' + {"rpc-socket", required_argument, NULL, RPC_SOCKET_OPT_IDX}, +#define MEM_SIZE_OPT_IDX 's' + {"mem-size", required_argument, NULL, MEM_SIZE_OPT_IDX}, +#define NO_PCI_OPT_IDX 'u' + {"no-pci", no_argument, NULL, NO_PCI_OPT_IDX}, +#define PCI_BLACKLIST_OPT_IDX 'B' + {"pci-blacklist", required_argument, NULL, PCI_BLACKLIST_OPT_IDX}, +#define TRACEFLAG_OPT_IDX 'L' + {"traceflag", required_argument, NULL, TRACEFLAG_OPT_IDX}, +#define HUGE_UNLINK_OPT_IDX 'R' + {"huge-unlink", no_argument, NULL, HUGE_UNLINK_OPT_IDX}, +#define PCI_WHITELIST_OPT_IDX 'W' + {"pci-whitelist", required_argument, 
NULL, PCI_WHITELIST_OPT_IDX}, +#define SILENCE_NOTICELOG_OPT_IDX 257 + {"silence-noticelog", no_argument, NULL, SILENCE_NOTICELOG_OPT_IDX}, +#define WAIT_FOR_RPC_OPT_IDX 258 + {"wait-for-rpc", no_argument, NULL, WAIT_FOR_RPC_OPT_IDX}, +}; + +/* Global section */ +#define GLOBAL_CONFIG_TMPL \ +"# Configuration file\n" \ +"#\n" \ +"# Please write all parameters using ASCII.\n" \ +"# The parameter must be quoted if it includes whitespace.\n" \ +"#\n" \ +"# Configuration syntax:\n" \ +"# Spaces at head of line are deleted, other spaces are as separator\n" \ +"# Lines starting with '#' are comments and not evaluated.\n" \ +"# Lines ending with '\\' are concatenated with the next line.\n" \ +"# Bracketed keys are section keys grouping the following value keys.\n" \ +"# Number of section key is used as a tag number.\n" \ +"# Ex. [TargetNode1] = TargetNode section key with tag number 1\n" \ +"[Global]\n" \ +" Comment \"Global section\"\n" \ +"\n" \ +" # Users can restrict work items to only run on certain cores by\n" \ +" # specifying a ReactorMask. Default is to allow work items to run\n" \ +" # on all cores. Core 0 must be set in the mask if one is specified.\n" \ +" # Default: 0xFFFF (cores 0-15)\n" \ +" ReactorMask \"0x%s\"\n" \ +"\n" \ +" # Tracepoint group mask for spdk trace buffers\n" \ +" # Default: 0x0 (all tracepoint groups disabled)\n" \ +" # Set to 0xFFFF to enable all tracepoint groups.\n" \ +" TpointGroupMask \"0x%" PRIX64 "\"\n" \ +"\n" \ + +static void +spdk_app_config_dump_global_section(FILE *fp) +{ + struct spdk_cpuset *coremask; + + if (NULL == fp) { + return; + } + + coremask = spdk_app_get_core_mask(); + + fprintf(fp, GLOBAL_CONFIG_TMPL, spdk_cpuset_fmt(coremask), + spdk_trace_get_tpoint_group_mask()); +} + +int +spdk_app_get_running_config(char **config_str, char *name) +{ + FILE *fp = NULL; + int fd = -1; + long length = 0, ret = 0; + char vbuf[BUFSIZ]; + char config_template[64]; + + snprintf(config_template, sizeof(config_template), "/tmp/%s.XXXXXX", name); + /* Create temporary file to hold config */ + fd = mkstemp(config_template); + if (fd == -1) { + SPDK_ERRLOG("mkstemp failed\n"); + return -1; + } + fp = fdopen(fd, "wb+"); + if (NULL == fp) { + SPDK_ERRLOG("error opening tmpfile fd = %d\n", fd); + return -1; + } + + /* Buffered IO */ + setvbuf(fp, vbuf, _IOFBF, BUFSIZ); + + spdk_app_config_dump_global_section(fp); + spdk_subsystem_config(fp); + + length = ftell(fp); + + *config_str = malloc(length + 1); + if (!*config_str) { + SPDK_ERRLOG("out-of-memory for config\n"); + fclose(fp); + return -1; + } + fseek(fp, 0, SEEK_SET); + ret = fread(*config_str, sizeof(char), length, fp); + if (ret < length) { + SPDK_ERRLOG("short read\n"); + } + fclose(fp); + (*config_str)[length] = '\0'; + + return 0; +} + +void +spdk_app_start_shutdown(void) +{ + if (g_shutdown_event != NULL) { + spdk_event_call(g_shutdown_event); + g_shutdown_event = NULL; + } else { + spdk_app_stop(0); + } +} + +static void +__shutdown_signal(int signo) +{ + if (!g_shutdown_sig_received) { + g_shutdown_sig_received = true; + spdk_app_start_shutdown(); + } +} + +static void +__shutdown_event_cb(void *arg1, void *arg2) +{ + g_spdk_app.shutdown_cb(); +} + +static int +spdk_app_opts_validate(const char *app_opts) +{ + int i = 0, j; + + for (i = 0; app_opts[i] != '\0'; i++) { + /* ignore getopt control characters */ + if (app_opts[i] == ':' || app_opts[i] == '+' || app_opts[i] == '-') { + continue; + } + + for (j = 0; SPDK_APP_GETOPT_STRING[j] != '\0'; j++) { + if (app_opts[i] == SPDK_APP_GETOPT_STRING[j]) { + 
return app_opts[i]; + } + } + } + return 0; +} + +void +spdk_app_opts_init(struct spdk_app_opts *opts) +{ + if (!opts) { + return; + } + + memset(opts, 0, sizeof(*opts)); + + opts->enable_coredump = true; + opts->shm_id = -1; + opts->mem_size = SPDK_APP_DPDK_DEFAULT_MEM_SIZE; + opts->master_core = SPDK_APP_DPDK_DEFAULT_MASTER_CORE; + opts->mem_channel = SPDK_APP_DPDK_DEFAULT_MEM_CHANNEL; + opts->reactor_mask = NULL; + opts->max_delay_us = 0; + opts->print_level = SPDK_APP_DEFAULT_LOG_PRINT_LEVEL; + opts->rpc_addr = SPDK_DEFAULT_RPC_ADDR; + opts->delay_subsystem_init = false; +} + +static int +spdk_app_setup_signal_handlers(struct spdk_app_opts *opts) +{ + struct sigaction sigact; + sigset_t sigmask; + int rc; + + /* Set up custom shutdown handling if the user requested it. */ + if (opts->shutdown_cb != NULL) { + g_shutdown_event = spdk_event_allocate(spdk_env_get_current_core(), + __shutdown_event_cb, + NULL, NULL); + } + + sigemptyset(&sigmask); + memset(&sigact, 0, sizeof(sigact)); + sigemptyset(&sigact.sa_mask); + + sigact.sa_handler = SIG_IGN; + rc = sigaction(SIGPIPE, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGPIPE) failed\n"); + return rc; + } + + /* Install the same handler for SIGINT and SIGTERM */ + sigact.sa_handler = __shutdown_signal; + + rc = sigaction(SIGINT, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGINT) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGINT); + + rc = sigaction(SIGTERM, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGTERM) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGTERM); + + if (opts->usr1_handler != NULL) { + sigact.sa_handler = opts->usr1_handler; + rc = sigaction(SIGUSR1, &sigact, NULL); + if (rc < 0) { + SPDK_ERRLOG("sigaction(SIGUSR1) failed\n"); + return rc; + } + sigaddset(&sigmask, SIGUSR1); + } + + pthread_sigmask(SIG_UNBLOCK, &sigmask, NULL); + + return 0; +} + +static void +spdk_app_start_application(void) +{ + spdk_rpc_set_state(SPDK_RPC_RUNTIME); + spdk_event_call(g_app_start_event); +} + +static void +spdk_app_start_rpc(void *arg1, void *arg2) +{ + const char *rpc_addr = arg1; + + spdk_rpc_initialize(rpc_addr); + if (!g_delay_subsystem_init) { + spdk_app_start_application(); + } +} + +static struct spdk_conf * +spdk_app_setup_conf(const char *config_file) +{ + struct spdk_conf *config; + int rc; + + config = spdk_conf_allocate(); + assert(config != NULL); + if (config_file) { + rc = spdk_conf_read(config, config_file); + if (rc != 0) { + SPDK_ERRLOG("Could not read config file %s\n", config_file); + goto error; + } + if (spdk_conf_first_section(config) == NULL) { + SPDK_ERRLOG("Invalid config file %s\n", config_file); + goto error; + } + } + spdk_conf_set_as_default(config); + return config; + +error: + spdk_conf_free(config); + return NULL; +} + +static int +spdk_app_opts_add_pci_addr(struct spdk_app_opts *opts, struct spdk_pci_addr **list, char *bdf) +{ + struct spdk_pci_addr *tmp = *list; + size_t i = opts->num_pci_addr; + + tmp = realloc(tmp, sizeof(*tmp) * (i + 1)); + if (tmp == NULL) { + SPDK_ERRLOG("realloc error\n"); + return -ENOMEM; + } + + *list = tmp; + if (spdk_pci_addr_parse(*list + i, bdf) < 0) { + SPDK_ERRLOG("Invalid address %s\n", bdf); + return -EINVAL; + } + + opts->num_pci_addr++; + return 0; +} + +static int +spdk_app_read_config_file_global_params(struct spdk_app_opts *opts) +{ + struct spdk_conf_section *sp; + char *bdf; + int i, rc = 0; + + sp = spdk_conf_find_section(NULL, "Global"); + + if (opts->shm_id == -1) { + if (sp != NULL) { + opts->shm_id = 
spdk_conf_section_get_intval(sp, "SharedMemoryID"); + } + } + + if (opts->reactor_mask == NULL) { + if (sp && spdk_conf_section_get_val(sp, "ReactorMask")) { + SPDK_ERRLOG("ReactorMask config option is deprecated. Use -m/--cpumask\n" + "command line parameter instead.\n"); + opts->reactor_mask = spdk_conf_section_get_val(sp, "ReactorMask"); + } else { + opts->reactor_mask = SPDK_APP_DPDK_DEFAULT_CORE_MASK; + } + } + + if (!opts->no_pci && sp) { + opts->no_pci = spdk_conf_section_get_boolval(sp, "NoPci", false); + } + + if (opts->tpoint_group_mask == NULL) { + if (sp != NULL) { + opts->tpoint_group_mask = spdk_conf_section_get_val(sp, "TpointGroupMask"); + } + } + + if (sp == NULL) { + return 0; + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciBlacklist", i, 0); + if (!bdf) { + break; + } + + rc = spdk_app_opts_add_pci_addr(opts, &opts->pci_blacklist, bdf); + if (rc != 0) { + free(opts->pci_blacklist); + return rc; + } + } + + for (i = 0; ; i++) { + bdf = spdk_conf_section_get_nmval(sp, "PciWhitelist", i, 0); + if (!bdf) { + break; + } + + if (opts->pci_blacklist != NULL) { + SPDK_ERRLOG("PciBlacklist and PciWhitelist cannot be used at the same time\n"); + free(opts->pci_blacklist); + return -EINVAL; + } + + rc = spdk_app_opts_add_pci_addr(opts, &opts->pci_whitelist, bdf); + if (rc != 0) { + free(opts->pci_whitelist); + return rc; + } + } + return 0; +} + +static int +spdk_app_setup_env(struct spdk_app_opts *opts) +{ + struct spdk_env_opts env_opts = {}; + int rc; + + spdk_env_opts_init(&env_opts); + + env_opts.name = opts->name; + env_opts.core_mask = opts->reactor_mask; + env_opts.shm_id = opts->shm_id; + env_opts.mem_channel = opts->mem_channel; + env_opts.master_core = opts->master_core; + env_opts.mem_size = opts->mem_size; + env_opts.hugepage_single_segments = opts->hugepage_single_segments; + env_opts.unlink_hugepage = opts->unlink_hugepage; + env_opts.no_pci = opts->no_pci; + env_opts.num_pci_addr = opts->num_pci_addr; + env_opts.pci_blacklist = opts->pci_blacklist; + env_opts.pci_whitelist = opts->pci_whitelist; + + rc = spdk_env_init(&env_opts); + free(env_opts.pci_blacklist); + free(env_opts.pci_whitelist); + + if (rc < 0) { + fprintf(stderr, "Unable to initialize SPDK env\n"); + } + + return rc; +} + +static int +spdk_app_setup_trace(struct spdk_app_opts *opts) +{ + char shm_name[64]; + uint64_t tpoint_group_mask; + char *end; + + if (opts->shm_id >= 0) { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.%d", opts->name, opts->shm_id); + } else { + snprintf(shm_name, sizeof(shm_name), "/%s_trace.pid%d", opts->name, (int)getpid()); + } + + if (spdk_trace_init(shm_name) != 0) { + return -1; + } + + if (opts->tpoint_group_mask != NULL) { + errno = 0; + tpoint_group_mask = strtoull(opts->tpoint_group_mask, &end, 16); + if (*end != '\0' || errno) { + SPDK_ERRLOG("invalid tpoint mask %s\n", opts->tpoint_group_mask); + } else { + SPDK_NOTICELOG("Tracepoint Group Mask %s specified.\n", opts->tpoint_group_mask); + SPDK_NOTICELOG("Use 'spdk_trace -s %s %s %d' to capture a snapshot of events at runtime.\n", + opts->name, + opts->shm_id >= 0 ? "-i" : "-p", + opts->shm_id >= 0 ? 
opts->shm_id : getpid()); +#if defined(__linux__) + SPDK_NOTICELOG("Or copy /dev/shm%s for offline analysis/debug.\n", shm_name); +#endif + spdk_trace_set_tpoint_group_mask(tpoint_group_mask); + } + } + + return 0; +} + +int +spdk_app_start(struct spdk_app_opts *opts, spdk_event_fn start_fn, + void *arg1, void *arg2) +{ + struct spdk_conf *config = NULL; + int rc; + struct spdk_event *rpc_start_event; + char *tty; + + if (!opts) { + SPDK_ERRLOG("opts should not be NULL\n"); + return 1; + } + + if (!start_fn) { + SPDK_ERRLOG("start_fn should not be NULL\n"); + return 1; + } + + tty = ttyname(STDERR_FILENO); + if (opts->print_level > SPDK_LOG_WARN && + isatty(STDERR_FILENO) && + tty && + !strncmp(tty, "/dev/tty", strlen("/dev/tty"))) { + printf("Warning: printing stderr to console terminal without -q option specified.\n"); + printf("Suggest using --silence-noticelog to disable logging to stderr and\n"); + printf("monitor syslog, or redirect stderr to a file.\n"); + printf("(Delaying for 10 seconds...)\n"); + sleep(10); + } + + spdk_log_set_print_level(opts->print_level); + +#ifndef SPDK_NO_RLIMIT + if (opts->enable_coredump) { + struct rlimit core_limits; + + core_limits.rlim_cur = core_limits.rlim_max = RLIM_INFINITY; + setrlimit(RLIMIT_CORE, &core_limits); + } +#endif + + config = spdk_app_setup_conf(opts->config_file); + if (config == NULL) { + goto app_start_setup_conf_err; + } + + if (spdk_app_read_config_file_global_params(opts) < 0) { + goto app_start_setup_conf_err; + } + + spdk_log_set_level(SPDK_APP_DEFAULT_LOG_LEVEL); + spdk_log_set_backtrace_level(SPDK_APP_DEFAULT_BACKTRACE_LOG_LEVEL); + + if (spdk_app_setup_env(opts) < 0) { + goto app_start_setup_conf_err; + } + + spdk_log_open(); + SPDK_NOTICELOG("Total cores available: %d\n", spdk_env_get_core_count()); + + spdk_thread_lib_init(); + + /* + * If mask not specified on command line or in configuration file, + * reactor_mask will be 0x1 which will enable core 0 to run one + * reactor. + */ + if ((rc = spdk_reactors_init(opts->max_delay_us)) != 0) { + SPDK_ERRLOG("Invalid reactor mask.\n"); + goto app_start_log_close_err; + } + + /* + * Note the call to spdk_app_setup_trace() is located here + * ahead of spdk_app_setup_signal_handlers(). + * That's because there is not an easy/direct clean + * way of unwinding alloc'd resources that can occur + * in spdk_app_setup_signal_handlers(). 
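[Editor's aside, not part of the patch: for orientation, a consumer of this module drives the spdk_app_* entry points defined in this file roughly as below. The application name, the empty option string, and the callback bodies are placeholders, and error handling is minimal; this is a sketch of typical usage, not code from the upstream tree.

#include "spdk/stdinc.h"
#include "spdk/event.h"

/* Runs on the master reactor once all subsystems are initialized. */
static void
hello_start(void *arg1, void *arg2)
{
	/* ... register pollers, open bdevs, etc. ... */
	spdk_app_stop(0);	/* tell the reactors to exit again */
}

static void
hello_parse_arg(int ch, char *arg)
{
	/* no application-specific options in this sketch */
}

static void
hello_usage(void)
{
}

int
main(int argc, char **argv)
{
	struct spdk_app_opts opts;
	int rc;

	spdk_app_opts_init(&opts);
	opts.name = "hello_app";	/* hypothetical application name */

	if (spdk_app_parse_args(argc, argv, &opts, "", NULL,
				hello_parse_arg, hello_usage) != SPDK_APP_PARSE_ARGS_SUCCESS) {
		return 1;
	}

	rc = spdk_app_start(&opts, hello_start, NULL, NULL);	/* blocks until spdk_app_stop() */
	spdk_app_fini();
	return rc;
}

(end of editor's aside)]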
+ */ + if (spdk_app_setup_trace(opts) != 0) { + goto app_start_log_close_err; + } + + if ((rc = spdk_app_setup_signal_handlers(opts)) != 0) { + goto app_start_trace_cleanup_err; + } + + memset(&g_spdk_app, 0, sizeof(g_spdk_app)); + g_spdk_app.config = config; + g_spdk_app.shm_id = opts->shm_id; + g_spdk_app.shutdown_cb = opts->shutdown_cb; + g_spdk_app.rc = 0; + g_init_lcore = spdk_env_get_current_core(); + g_delay_subsystem_init = opts->delay_subsystem_init; + g_app_start_event = spdk_event_allocate(g_init_lcore, start_fn, arg1, arg2); + + rpc_start_event = spdk_event_allocate(g_init_lcore, spdk_app_start_rpc, + (void *)opts->rpc_addr, NULL); + + if (!g_delay_subsystem_init) { + spdk_subsystem_init(rpc_start_event); + } else { + spdk_event_call(rpc_start_event); + } + + /* This blocks until spdk_app_stop is called */ + spdk_reactors_start(); + + return g_spdk_app.rc; + +app_start_trace_cleanup_err: + spdk_trace_cleanup(); + +app_start_log_close_err: + spdk_log_close(); + +app_start_setup_conf_err: + return 1; +} + +void +spdk_app_fini(void) +{ + spdk_trace_cleanup(); + spdk_reactors_fini(); + spdk_conf_free(g_spdk_app.config); + spdk_log_close(); + spdk_thread_lib_fini(); +} + +static void +_spdk_app_stop(void *arg1, void *arg2) +{ + struct spdk_event *app_stop_event; + + spdk_rpc_finish(); + + app_stop_event = spdk_event_allocate(spdk_env_get_current_core(), spdk_reactors_stop, NULL, NULL); + spdk_subsystem_fini(app_stop_event); +} + +void +spdk_app_stop(int rc) +{ + if (rc) { + SPDK_WARNLOG("spdk_app_stop'd on non-zero\n"); + } + g_spdk_app.rc = rc; + /* + * We want to run spdk_subsystem_fini() from the same lcore where spdk_subsystem_init() + * was called. + */ + spdk_event_call(spdk_event_allocate(g_init_lcore, _spdk_app_stop, NULL, NULL)); +} + +static void +usage(void (*app_usage)(void)) +{ + printf("%s [options]\n", g_executable_name); + printf("options:\n"); + printf(" -c, --config config file (default %s)\n", g_default_opts.config_file); + printf(" -d, --limit-coredump do not set max coredump size to RLIM_INFINITY\n"); + printf(" -e, --tpoint-group-mask \n"); + printf(" tracepoint group mask for spdk trace buffers (default 0x0)\n"); + printf(" -g, --single-file-segments\n"); + printf(" force creating just one hugetlbfs file\n"); + printf(" -h, --help show this usage\n"); + printf(" -i, --shm-id shared memory ID (optional)\n"); + printf(" -m, --cpumask core mask for DPDK\n"); + printf(" -n, --mem-channels channel number of memory channels used for DPDK\n"); + printf(" -p, --master-core master (primary) core for DPDK\n"); + printf(" -r, --rpc-socket RPC listen address (default %s)\n", SPDK_DEFAULT_RPC_ADDR); + printf(" -s, --mem-size memory size in MB for DPDK (default: "); + if (g_default_opts.mem_size > 0) { + printf("%dMB)\n", g_default_opts.mem_size); + } else { + printf("all hugepage memory)\n"); + } + printf(" --silence-noticelog disable notice level logging to stderr\n"); + printf(" -u, --no-pci disable PCI access\n"); + printf(" --wait-for-rpc wait for RPCs to initialize subsystems\n"); + printf(" -B, --pci-blacklist \n"); + printf(" pci addr to blacklist (can be used more than once)\n"); + printf(" -R, --huge-unlink unlink huge files after initialization\n"); + printf(" -W, --pci-whitelist \n"); + printf(" pci addr to whitelist (-B and -W cannot be used at the same time)\n"); + spdk_tracelog_usage(stdout, "-L"); + if (app_usage) { + app_usage(); + } +} + +spdk_app_parse_args_rvals_t +spdk_app_parse_args(int argc, char **argv, struct spdk_app_opts *opts, + const char 
*app_getopt_str, struct option *app_long_opts, + void (*app_parse)(int ch, char *arg), + void (*app_usage)(void)) +{ + int ch, rc, opt_idx, global_long_opts_len, app_long_opts_len; + struct option *cmdline_options; + char *cmdline_short_opts = NULL; + enum spdk_app_parse_args_rvals retval = SPDK_APP_PARSE_ARGS_FAIL; + + memcpy(&g_default_opts, opts, sizeof(g_default_opts)); + + if (opts->config_file && access(opts->config_file, F_OK) != 0) { + opts->config_file = NULL; + } + + if (app_long_opts == NULL) { + app_long_opts_len = 0; + } else { + for (app_long_opts_len = 0; + app_long_opts[app_long_opts_len].name != NULL; + app_long_opts_len++); + } + + global_long_opts_len = SPDK_COUNTOF(g_cmdline_options); + + cmdline_options = calloc(global_long_opts_len + app_long_opts_len + 1, sizeof(*cmdline_options)); + if (!cmdline_options) { + fprintf(stderr, "Out of memory\n"); + return SPDK_APP_PARSE_ARGS_FAIL; + } + + memcpy(&cmdline_options[0], g_cmdline_options, sizeof(g_cmdline_options)); + if (app_long_opts) { + memcpy(&cmdline_options[global_long_opts_len], app_long_opts, + app_long_opts_len * sizeof(*app_long_opts)); + } + + if (app_getopt_str != NULL) { + ch = spdk_app_opts_validate(app_getopt_str); + if (ch) { + fprintf(stderr, "Duplicated option '%c' between the generic and application specific spdk opts.\n", + ch); + goto out; + } + } + + cmdline_short_opts = spdk_sprintf_alloc("%s%s", app_getopt_str, SPDK_APP_GETOPT_STRING); + if (!cmdline_short_opts) { + fprintf(stderr, "Out of memory\n"); + goto out; + } + + g_executable_name = argv[0]; + + while ((ch = getopt_long(argc, argv, cmdline_short_opts, cmdline_options, &opt_idx)) != -1) { + switch (ch) { + case CONFIG_FILE_OPT_IDX: + opts->config_file = optarg; + break; + case LIMIT_COREDUMP_OPT_IDX: + opts->enable_coredump = false; + break; + case TPOINT_GROUP_MASK_OPT_IDX: + opts->tpoint_group_mask = optarg; + break; + case SINGLE_FILE_SEGMENTS_OPT_IDX: + opts->hugepage_single_segments = true; + break; + case HELP_OPT_IDX: + usage(app_usage); + retval = SPDK_APP_PARSE_ARGS_HELP; + goto out; + case SHM_ID_OPT_IDX: + if (optarg == NULL) { + goto out; + } + opts->shm_id = atoi(optarg); + break; + case CPUMASK_OPT_IDX: + opts->reactor_mask = optarg; + break; + case MEM_CHANNELS_OPT_IDX: + if (optarg == NULL) { + goto out; + } + opts->mem_channel = atoi(optarg); + break; + case MASTER_CORE_OPT_IDX: + if (optarg == NULL) { + goto out; + } + opts->master_core = atoi(optarg); + break; + case SILENCE_NOTICELOG_OPT_IDX: + opts->print_level = SPDK_LOG_WARN; + break; + case RPC_SOCKET_OPT_IDX: + opts->rpc_addr = optarg; + break; + case MEM_SIZE_OPT_IDX: { + uint64_t mem_size_mb; + bool mem_size_has_prefix; + + rc = spdk_parse_capacity(optarg, &mem_size_mb, &mem_size_has_prefix); + if (rc != 0) { + fprintf(stderr, "invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + if (mem_size_has_prefix) { + /* the mem size is in MB by default, so if a prefix was + * specified, we need to manually convert to MB. 
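[Editor's note, not part of the patch: a worked example of the conversion described above. With "-s 1024", spdk_parse_capacity() reports no binary prefix, so mem_size stays 1024 (MB). With "-s 2G", it returns 2147483648 with mem_size_has_prefix set, and the division by 1024 * 1024 just below turns that back into 2048 MB.]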
+ */ + mem_size_mb /= 1024 * 1024; + } + + if (mem_size_mb > INT_MAX) { + fprintf(stderr, "invalid memory pool size `-s %s`\n", optarg); + usage(app_usage); + goto out; + } + + opts->mem_size = (int) mem_size_mb; + break; + } + case NO_PCI_OPT_IDX: + opts->no_pci = true; + break; + case WAIT_FOR_RPC_OPT_IDX: + opts->delay_subsystem_init = true; + break; + case PCI_BLACKLIST_OPT_IDX: + if (opts->pci_whitelist) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + fprintf(stderr, "-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = spdk_app_opts_add_pci_addr(opts, &opts->pci_blacklist, optarg); + if (rc != 0) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + goto out; + } + break; + case TRACEFLAG_OPT_IDX: +#ifndef DEBUG + fprintf(stderr, "%s must be built with CONFIG_DEBUG=y for -L flag\n", + argv[0]); + usage(app_usage); + goto out; +#else + rc = spdk_log_set_trace_flag(optarg); + if (rc < 0) { + fprintf(stderr, "unknown flag\n"); + usage(app_usage); + goto out; + } + opts->print_level = SPDK_LOG_DEBUG; + break; +#endif + case HUGE_UNLINK_OPT_IDX: + opts->unlink_hugepage = true; + break; + case PCI_WHITELIST_OPT_IDX: + if (opts->pci_blacklist) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + fprintf(stderr, "-B and -W cannot be used at the same time\n"); + usage(app_usage); + goto out; + } + + rc = spdk_app_opts_add_pci_addr(opts, &opts->pci_whitelist, optarg); + if (rc != 0) { + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + goto out; + } + break; + case '?': + /* + * In the event getopt() above detects an option + * in argv that is NOT in the getopt_str, + * getopt() will return a '?' indicating failure. + */ + usage(app_usage); + goto out; + default: + app_parse(ch, optarg); + } + } + + /* TBD: Replace warning by failure when RPCs for startup are prepared. */ + if (opts->config_file && opts->delay_subsystem_init) { + fprintf(stderr, + "WARNING: --wait-for-rpc and config file are used at the same time. 
" + "- Please be careful one options might overwrite others.\n"); + } + + retval = SPDK_APP_PARSE_ARGS_SUCCESS; +out: + if (retval != SPDK_APP_PARSE_ARGS_SUCCESS) { + free(opts->pci_blacklist); + opts->pci_blacklist = NULL; + free(opts->pci_whitelist); + opts->pci_whitelist = NULL; + } + free(cmdline_short_opts); + free(cmdline_options); + return retval; +} + +void +spdk_app_usage(void) +{ + if (g_executable_name == NULL) { + fprintf(stderr, "%s not valid before calling spdk_app_parse_args()\n", __func__); + return; + } + + usage(NULL); +} + +static void +spdk_rpc_start_subsystem_init_cpl(void *arg1, void *arg2) +{ + struct spdk_jsonrpc_request *request = arg1; + struct spdk_json_write_ctx *w; + + spdk_app_start_application(); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_start_subsystem_init(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_event *cb_event; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "start_subsystem_init requires no parameters"); + return; + } + + cb_event = spdk_event_allocate(g_init_lcore, spdk_rpc_start_subsystem_init_cpl, + request, NULL); + spdk_subsystem_init(cb_event); +} +SPDK_RPC_REGISTER("start_subsystem_init", spdk_rpc_start_subsystem_init, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/event/reactor.c b/src/spdk/lib/event/reactor.c new file mode 100644 index 00000000..d9ba9f6b --- /dev/null +++ b/src/spdk/lib/event/reactor.c @@ -0,0 +1,804 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" +#include "spdk/likely.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/env.h" +#include "spdk/util.h" + +#define SPDK_MAX_SOCKET 64 + +#define SPDK_EVENT_BATCH_SIZE 8 + +enum spdk_poller_state { + /* The poller is registered with a reactor but not currently executing its fn. */ + SPDK_POLLER_STATE_WAITING, + + /* The poller is currently running its fn. */ + SPDK_POLLER_STATE_RUNNING, + + /* The poller was unregistered during the execution of its fn. */ + SPDK_POLLER_STATE_UNREGISTERED, +}; + +struct spdk_poller { + TAILQ_ENTRY(spdk_poller) tailq; + uint32_t lcore; + + /* Current state of the poller; should only be accessed from the poller's thread. */ + enum spdk_poller_state state; + + uint64_t period_ticks; + uint64_t next_run_tick; + spdk_poller_fn fn; + void *arg; +}; + +enum spdk_reactor_state { + SPDK_REACTOR_STATE_INVALID = 0, + SPDK_REACTOR_STATE_INITIALIZED = 1, + SPDK_REACTOR_STATE_RUNNING = 2, + SPDK_REACTOR_STATE_EXITING = 3, + SPDK_REACTOR_STATE_SHUTDOWN = 4, +}; + +struct spdk_reactor { + /* Logical core number for this reactor. */ + uint32_t lcore; + + /* Socket ID for this reactor. */ + uint32_t socket_id; + + /* Poller for get the rusage for the reactor. */ + struct spdk_poller *rusage_poller; + + /* Reactor tsc stats */ + struct spdk_reactor_tsc_stats tsc_stats; + + uint64_t tsc_last; + + /* The last known rusage values */ + struct rusage rusage; + + /* + * Contains pollers actively running on this reactor. Pollers + * are run round-robin. The reactor takes one poller from the head + * of the ring, executes it, then puts it back at the tail of + * the ring. + */ + TAILQ_HEAD(, spdk_poller) active_pollers; + + /** + * Contains pollers running on this reactor with a periodic timer. + */ + TAILQ_HEAD(timer_pollers_head, spdk_poller) timer_pollers; + + struct spdk_ring *events; + + /* Pointer to the per-socket g_spdk_event_mempool for this reactor. */ + struct spdk_mempool *event_mempool; + + uint64_t max_delay_us; +} __attribute__((aligned(64))); + +static struct spdk_reactor *g_reactors; + +static enum spdk_reactor_state g_reactor_state = SPDK_REACTOR_STATE_INVALID; + +static bool g_context_switch_monitor_enabled = true; + +static void spdk_reactor_construct(struct spdk_reactor *w, uint32_t lcore, + uint64_t max_delay_us); + +static struct spdk_mempool *g_spdk_event_mempool[SPDK_MAX_SOCKET]; + +static struct spdk_cpuset *g_spdk_app_core_mask; + +static struct spdk_reactor * +spdk_reactor_get(uint32_t lcore) +{ + struct spdk_reactor *reactor; + reactor = spdk_likely(g_reactors) ? 
&g_reactors[lcore] : NULL; + return reactor; +} + +struct spdk_event * +spdk_event_allocate(uint32_t lcore, spdk_event_fn fn, void *arg1, void *arg2) +{ + struct spdk_event *event = NULL; + struct spdk_reactor *reactor = spdk_reactor_get(lcore); + + if (!reactor) { + assert(false); + return NULL; + } + + event = spdk_mempool_get(reactor->event_mempool); + if (event == NULL) { + assert(false); + return NULL; + } + + event->lcore = lcore; + event->fn = fn; + event->arg1 = arg1; + event->arg2 = arg2; + + return event; +} + +void +spdk_event_call(struct spdk_event *event) +{ + int rc; + struct spdk_reactor *reactor; + + reactor = spdk_reactor_get(event->lcore); + + assert(reactor->events != NULL); + rc = spdk_ring_enqueue(reactor->events, (void **)&event, 1); + if (rc != 1) { + assert(false); + } +} + +static inline uint32_t +_spdk_event_queue_run_batch(struct spdk_reactor *reactor) +{ + unsigned count, i; + void *events[SPDK_EVENT_BATCH_SIZE]; + +#ifdef DEBUG + /* + * spdk_ring_dequeue() fills events and returns how many entries it wrote, + * so we will never actually read uninitialized data from events, but just to be sure + * (and to silence a static analyzer false positive), initialize the array to NULL pointers. + */ + memset(events, 0, sizeof(events)); +#endif + + count = spdk_ring_dequeue(reactor->events, events, SPDK_EVENT_BATCH_SIZE); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + struct spdk_event *event = events[i]; + + assert(event != NULL); + event->fn(event->arg1, event->arg2); + } + + spdk_mempool_put_bulk(reactor->event_mempool, events, count); + + return count; +} + +static void +_spdk_reactor_msg_passed(void *arg1, void *arg2) +{ + spdk_thread_fn fn = arg1; + + fn(arg2); +} + +static void +_spdk_reactor_send_msg(spdk_thread_fn fn, void *ctx, void *thread_ctx) +{ + struct spdk_event *event; + struct spdk_reactor *reactor; + + reactor = thread_ctx; + + event = spdk_event_allocate(reactor->lcore, _spdk_reactor_msg_passed, fn, ctx); + + spdk_event_call(event); +} + +static void +_spdk_poller_insert_timer(struct spdk_reactor *reactor, struct spdk_poller *poller, uint64_t now) +{ + struct spdk_poller *iter; + uint64_t next_run_tick; + + next_run_tick = now + poller->period_ticks; + poller->next_run_tick = next_run_tick; + + /* + * Insert poller in the reactor's timer_pollers list in sorted order by next scheduled + * run time. 
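[Editor's aside, not part of the patch: for context, application code consumes this machinery through the public poller API used elsewhere in this file (spdk_poller_register()/spdk_poller_unregister() from spdk/thread.h), which lands in _spdk_reactor_start_poller()/_spdk_reactor_stop_poller() below. A small sketch, assuming it runs on an SPDK thread such as the spdk_app_start() callback; the names are hypothetical:

#include "spdk/stdinc.h"
#include "spdk/thread.h"

static struct spdk_poller *g_stats_poller;
static uint64_t g_samples;

/* Runs every 500 ms on the reactor that registered it.  In this version of
 * the reactor, a return value > 0 is accounted as "busy" time and 0 as
 * "idle" time in the per-reactor tsc statistics. */
static int
collect_stats(void *arg)
{
	g_samples++;
	return 1;
}

static void
stats_start(void)
{
	g_stats_poller = spdk_poller_register(collect_stats, NULL, 500 * 1000);
}

static void
stats_stop(void)
{
	spdk_poller_unregister(&g_stats_poller);
}

(end of editor's aside)]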
+ */ + TAILQ_FOREACH_REVERSE(iter, &reactor->timer_pollers, timer_pollers_head, tailq) { + if (iter->next_run_tick <= next_run_tick) { + TAILQ_INSERT_AFTER(&reactor->timer_pollers, iter, poller, tailq); + return; + } + } + + /* No earlier pollers were found, so this poller must be the new head */ + TAILQ_INSERT_HEAD(&reactor->timer_pollers, poller, tailq); +} + +static struct spdk_poller * +_spdk_reactor_start_poller(void *thread_ctx, + spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds) +{ + struct spdk_poller *poller; + struct spdk_reactor *reactor; + uint64_t quotient, remainder, ticks; + + reactor = thread_ctx; + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Poller memory allocation failed\n"); + return NULL; + } + + poller->lcore = reactor->lcore; + poller->state = SPDK_POLLER_STATE_WAITING; + poller->fn = fn; + poller->arg = arg; + + if (period_microseconds) { + quotient = period_microseconds / SPDK_SEC_TO_USEC; + remainder = period_microseconds % SPDK_SEC_TO_USEC; + ticks = spdk_get_ticks_hz(); + + poller->period_ticks = ticks * quotient + (ticks * remainder) / SPDK_SEC_TO_USEC; + } else { + poller->period_ticks = 0; + } + + if (poller->period_ticks) { + _spdk_poller_insert_timer(reactor, poller, spdk_get_ticks()); + } else { + TAILQ_INSERT_TAIL(&reactor->active_pollers, poller, tailq); + } + + return poller; +} + +static void +_spdk_reactor_stop_poller(struct spdk_poller *poller, void *thread_ctx) +{ + struct spdk_reactor *reactor; + + reactor = thread_ctx; + + assert(poller->lcore == spdk_env_get_current_core()); + + if (poller->state == SPDK_POLLER_STATE_RUNNING) { + /* + * We are being called from the poller_fn, so set the state to unregistered + * and let the reactor loop free the poller. + */ + poller->state = SPDK_POLLER_STATE_UNREGISTERED; + } else { + /* Poller is not running currently, so just free it. */ + if (poller->period_ticks) { + TAILQ_REMOVE(&reactor->timer_pollers, poller, tailq); + } else { + TAILQ_REMOVE(&reactor->active_pollers, poller, tailq); + } + + free(poller); + } +} + +static int +get_rusage(void *arg) +{ + struct spdk_reactor *reactor = arg; + struct rusage rusage; + + if (getrusage(RUSAGE_THREAD, &rusage) != 0) { + return -1; + } + + if (rusage.ru_nvcsw != reactor->rusage.ru_nvcsw || rusage.ru_nivcsw != reactor->rusage.ru_nivcsw) { + SPDK_INFOLOG(SPDK_LOG_REACTOR, + "Reactor %d: %ld voluntary context switches and %ld involuntary context switches in the last second.\n", + reactor->lcore, rusage.ru_nvcsw - reactor->rusage.ru_nvcsw, + rusage.ru_nivcsw - reactor->rusage.ru_nivcsw); + } + reactor->rusage = rusage; + + return -1; +} + +static void +_spdk_reactor_context_switch_monitor_start(void *arg1, void *arg2) +{ + struct spdk_reactor *reactor = arg1; + + if (reactor->rusage_poller == NULL) { + getrusage(RUSAGE_THREAD, &reactor->rusage); + reactor->rusage_poller = spdk_poller_register(get_rusage, reactor, 1000000); + } +} + +static void +_spdk_reactor_context_switch_monitor_stop(void *arg1, void *arg2) +{ + struct spdk_reactor *reactor = arg1; + + if (reactor->rusage_poller != NULL) { + spdk_poller_unregister(&reactor->rusage_poller); + } +} + +static size_t +_spdk_reactor_get_max_event_cnt(uint8_t socket_count) +{ + size_t cnt; + + /* Try to make event ring fill at most 2MB of memory, + * as some ring implementations may require physical address + * contingency. We don't want to introduce a requirement of + * at least 2 physically contiguous 2MB hugepages. 
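[Editor's note, not part of the patch: in the computation just below, 262144 is presumably 2 MiB divided by the 8 bytes each ring slot (a void * on 64-bit hosts) occupies. For example, on a two-socket machine cnt = spdk_min(262144 / 2, 262144 / 2) - 1 = 131071 events per per-socket ring.]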
+ */ + cnt = spdk_min(262144 / socket_count, 262144 / 2); + /* Take into account one extra element required by + * some ring implementations. + */ + cnt -= 1; + return cnt; +} + +void +spdk_reactor_enable_context_switch_monitor(bool enable) +{ + struct spdk_reactor *reactor; + spdk_event_fn fn; + uint32_t core; + + if (enable != g_context_switch_monitor_enabled) { + g_context_switch_monitor_enabled = enable; + if (enable) { + fn = _spdk_reactor_context_switch_monitor_start; + } else { + fn = _spdk_reactor_context_switch_monitor_stop; + } + SPDK_ENV_FOREACH_CORE(core) { + reactor = spdk_reactor_get(core); + spdk_event_call(spdk_event_allocate(core, fn, reactor, NULL)); + } + } +} + +bool +spdk_reactor_context_switch_monitor_enabled(void) +{ + return g_context_switch_monitor_enabled; +} + +static void +spdk_reactor_add_tsc_stats(void *arg, int rc, uint64_t now) +{ + struct spdk_reactor *reactor = arg; + struct spdk_reactor_tsc_stats *tsc_stats = &reactor->tsc_stats; + + if (rc == 0) { + /* Poller status idle */ + tsc_stats->idle_tsc += now - reactor->tsc_last; + } else if (rc > 0) { + /* Poller status busy */ + tsc_stats->busy_tsc += now - reactor->tsc_last; + } else { + /* Poller status unknown */ + tsc_stats->unknown_tsc += now - reactor->tsc_last; + } + + reactor->tsc_last = now; +} + +int +spdk_reactor_get_tsc_stats(struct spdk_reactor_tsc_stats *tsc_stats, uint32_t core) +{ + struct spdk_reactor *reactor; + + if (!spdk_cpuset_get_cpu(g_spdk_app_core_mask, core)) { + return -1; + } + + reactor = spdk_reactor_get(core); + *tsc_stats = reactor->tsc_stats; + + return 0; +} + +/** + * + * \brief This is the main function of the reactor thread. + * + * \code + * + * while (1) + * if (events to run) + * dequeue and run a batch of events + * + * if (active pollers) + * run the first poller in the list and move it to the back + * + * if (first timer poller has expired) + * run the first timer poller and reinsert it in the timer list + * + * if (no action taken and sleep enabled) + * sleep until next timer poller is scheduled to expire + * \endcode + * + */ +static int +_spdk_reactor_run(void *arg) +{ + struct spdk_reactor *reactor = arg; + struct spdk_poller *poller; + uint32_t event_count; + uint64_t now; + uint64_t sleep_cycles; + uint32_t sleep_us; + int rc = -1; + char thread_name[32]; + + snprintf(thread_name, sizeof(thread_name), "reactor_%u", reactor->lcore); + if (spdk_allocate_thread(_spdk_reactor_send_msg, + _spdk_reactor_start_poller, + _spdk_reactor_stop_poller, + reactor, thread_name) == NULL) { + return -1; + } + SPDK_NOTICELOG("Reactor started on core %u on socket %u\n", reactor->lcore, + reactor->socket_id); + + sleep_cycles = reactor->max_delay_us * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; + if (g_context_switch_monitor_enabled) { + _spdk_reactor_context_switch_monitor_start(reactor, NULL); + } + now = spdk_get_ticks(); + reactor->tsc_last = now; + + while (1) { + bool took_action = false; + + event_count = _spdk_event_queue_run_batch(reactor); + if (event_count > 0) { + rc = 1; + now = spdk_get_ticks(); + spdk_reactor_add_tsc_stats(reactor, rc, now); + took_action = true; + } + + poller = TAILQ_FIRST(&reactor->active_pollers); + if (poller) { + TAILQ_REMOVE(&reactor->active_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_RUNNING; + rc = poller->fn(poller->arg); + now = spdk_get_ticks(); + spdk_reactor_add_tsc_stats(reactor, rc, now); + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + free(poller); + } else { + poller->state = SPDK_POLLER_STATE_WAITING; + 
TAILQ_INSERT_TAIL(&reactor->active_pollers, poller, tailq); + } + took_action = true; + } + + poller = TAILQ_FIRST(&reactor->timer_pollers); + if (poller) { + if (took_action == false) { + now = spdk_get_ticks(); + } + + if (now >= poller->next_run_tick) { + uint64_t tmp_timer_tsc; + + TAILQ_REMOVE(&reactor->timer_pollers, poller, tailq); + poller->state = SPDK_POLLER_STATE_RUNNING; + rc = poller->fn(poller->arg); + /* Save the tsc value from before poller->fn was executed. We want to + * use the current time for idle/busy tsc value accounting, but want to + * use the older time to reinsert to the timer poller below. */ + tmp_timer_tsc = now; + now = spdk_get_ticks(); + spdk_reactor_add_tsc_stats(reactor, rc, now); + if (poller->state == SPDK_POLLER_STATE_UNREGISTERED) { + free(poller); + } else { + poller->state = SPDK_POLLER_STATE_WAITING; + _spdk_poller_insert_timer(reactor, poller, tmp_timer_tsc); + } + took_action = true; + } + } + + /* Determine if the thread can sleep */ + if (sleep_cycles && !took_action) { + now = spdk_get_ticks(); + sleep_us = reactor->max_delay_us; + + poller = TAILQ_FIRST(&reactor->timer_pollers); + if (poller) { + /* There are timers registered, so don't sleep beyond + * when the next timer should fire */ + if (poller->next_run_tick < (now + sleep_cycles)) { + if (poller->next_run_tick <= now) { + sleep_us = 0; + } else { + sleep_us = ((poller->next_run_tick - now) * + SPDK_SEC_TO_USEC) / spdk_get_ticks_hz(); + } + } + } + + if (sleep_us > 0) { + usleep(sleep_us); + } + } + + if (g_reactor_state != SPDK_REACTOR_STATE_RUNNING) { + break; + } + } + + _spdk_reactor_context_switch_monitor_stop(reactor, NULL); + spdk_free_thread(); + return 0; +} + +static void +spdk_reactor_construct(struct spdk_reactor *reactor, uint32_t lcore, uint64_t max_delay_us) +{ + reactor->lcore = lcore; + reactor->socket_id = spdk_env_get_socket_id(lcore); + assert(reactor->socket_id < SPDK_MAX_SOCKET); + reactor->max_delay_us = max_delay_us; + + TAILQ_INIT(&reactor->active_pollers); + TAILQ_INIT(&reactor->timer_pollers); + + reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, reactor->socket_id); + if (!reactor->events) { + SPDK_NOTICELOG("Ring creation failed on preferred socket %d. 
Try other sockets.\n", + reactor->socket_id); + + reactor->events = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, + SPDK_ENV_SOCKET_ID_ANY); + } + assert(reactor->events != NULL); + + reactor->event_mempool = g_spdk_event_mempool[reactor->socket_id]; +} + +int +spdk_app_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int ret; + struct spdk_cpuset *validmask; + + ret = spdk_cpuset_parse(cpumask, mask); + if (ret < 0) { + return ret; + } + + validmask = spdk_app_get_core_mask(); + spdk_cpuset_and(cpumask, validmask); + + return 0; +} + +struct spdk_cpuset * +spdk_app_get_core_mask(void) +{ + return g_spdk_app_core_mask; +} + + +static uint64_t +spdk_reactor_get_socket_mask(void) +{ + uint32_t i; + uint32_t socket_id; + uint64_t socket_info = 0; + + SPDK_ENV_FOREACH_CORE(i) { + socket_id = spdk_env_get_socket_id(i); + socket_info |= (1ULL << socket_id); + } + + return socket_info; +} + +void +spdk_reactors_start(void) +{ + struct spdk_reactor *reactor; + uint32_t i, current_core; + int rc; + + g_reactor_state = SPDK_REACTOR_STATE_RUNNING; + g_spdk_app_core_mask = spdk_cpuset_alloc(); + + current_core = spdk_env_get_current_core(); + SPDK_ENV_FOREACH_CORE(i) { + if (i != current_core) { + reactor = spdk_reactor_get(i); + rc = spdk_env_thread_launch_pinned(reactor->lcore, _spdk_reactor_run, reactor); + if (rc < 0) { + SPDK_ERRLOG("Unable to start reactor thread on core %u\n", reactor->lcore); + assert(false); + return; + } + } + spdk_cpuset_set_cpu(g_spdk_app_core_mask, i, true); + } + + /* Start the master reactor */ + reactor = spdk_reactor_get(current_core); + _spdk_reactor_run(reactor); + + spdk_env_thread_wait_all(); + + g_reactor_state = SPDK_REACTOR_STATE_SHUTDOWN; + spdk_cpuset_free(g_spdk_app_core_mask); + g_spdk_app_core_mask = NULL; +} + +void +spdk_reactors_stop(void *arg1, void *arg2) +{ + g_reactor_state = SPDK_REACTOR_STATE_EXITING; +} + +int +spdk_reactors_init(unsigned int max_delay_us) +{ + int rc; + uint32_t i, j, last_core; + struct spdk_reactor *reactor; + uint64_t socket_mask = 0x0; + uint8_t socket_count = 0; + char mempool_name[32]; + + socket_mask = spdk_reactor_get_socket_mask(); + SPDK_NOTICELOG("Occupied cpu socket mask is 0x%lx\n", socket_mask); + + for (i = 0; i < SPDK_MAX_SOCKET; i++) { + if ((1ULL << i) & socket_mask) { + socket_count++; + } + } + if (socket_count == 0) { + SPDK_ERRLOG("No sockets occupied (internal error)\n"); + return -1; + } + + for (i = 0; i < SPDK_MAX_SOCKET; i++) { + if ((1ULL << i) & socket_mask) { + snprintf(mempool_name, sizeof(mempool_name), "evtpool%d_%d", i, getpid()); + g_spdk_event_mempool[i] = spdk_mempool_create(mempool_name, + _spdk_reactor_get_max_event_cnt(socket_count), + sizeof(struct spdk_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, i); + + if (g_spdk_event_mempool[i] == NULL) { + SPDK_NOTICELOG("Event_mempool creation failed on preferred socket %d.\n", i); + + /* + * Instead of failing the operation directly, try to create + * the mempool on any available sockets in the case that + * memory is not evenly installed on all sockets. If still + * fails, free all allocated memory and exits. 
+ */ + g_spdk_event_mempool[i] = spdk_mempool_create( + mempool_name, + _spdk_reactor_get_max_event_cnt(socket_count), + sizeof(struct spdk_event), + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + + if (g_spdk_event_mempool[i] == NULL) { + for (j = i - 1; j < i; j--) { + if (g_spdk_event_mempool[j] != NULL) { + spdk_mempool_free(g_spdk_event_mempool[j]); + } + } + SPDK_ERRLOG("spdk_event_mempool creation failed\n"); + return -1; + } + } + } else { + g_spdk_event_mempool[i] = NULL; + } + } + + /* struct spdk_reactor must be aligned on 64 byte boundary */ + last_core = spdk_env_get_last_core(); + rc = posix_memalign((void **)&g_reactors, 64, + (last_core + 1) * sizeof(struct spdk_reactor)); + if (rc != 0) { + SPDK_ERRLOG("Could not allocate array size=%u for g_reactors\n", + last_core + 1); + for (i = 0; i < SPDK_MAX_SOCKET; i++) { + if (g_spdk_event_mempool[i] != NULL) { + spdk_mempool_free(g_spdk_event_mempool[i]); + } + } + return -1; + } + + memset(g_reactors, 0, (last_core + 1) * sizeof(struct spdk_reactor)); + + SPDK_ENV_FOREACH_CORE(i) { + reactor = spdk_reactor_get(i); + spdk_reactor_construct(reactor, i, max_delay_us); + } + + g_reactor_state = SPDK_REACTOR_STATE_INITIALIZED; + + return 0; +} + +void +spdk_reactors_fini(void) +{ + uint32_t i; + struct spdk_reactor *reactor; + + SPDK_ENV_FOREACH_CORE(i) { + reactor = spdk_reactor_get(i); + if (spdk_likely(reactor != NULL) && reactor->events != NULL) { + spdk_ring_free(reactor->events); + } + } + + for (i = 0; i < SPDK_MAX_SOCKET; i++) { + if (g_spdk_event_mempool[i] != NULL) { + spdk_mempool_free(g_spdk_event_mempool[i]); + } + } + + free(g_reactors); + g_reactors = NULL; +} + +SPDK_LOG_REGISTER_COMPONENT("reactor", SPDK_LOG_REACTOR) diff --git a/src/spdk/lib/event/rpc.c b/src/spdk/lib/event/rpc.c new file mode 100644 index 00000000..f8414349 --- /dev/null +++ b/src/spdk/lib/event/rpc.c @@ -0,0 +1,82 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/rpc.h" + +#include "spdk_internal/event.h" + +#define RPC_SELECT_INTERVAL 4000 /* 4ms */ + +static struct spdk_poller *g_rpc_poller = NULL; + +static int +spdk_rpc_subsystem_poll(void *arg) +{ + spdk_rpc_accept(); + return -1; +} + +void +spdk_rpc_initialize(const char *listen_addr) +{ + int rc; + + if (listen_addr == NULL) { + return; + } + + /* Listen on the requested address */ + rc = spdk_rpc_listen(listen_addr); + if (rc != 0) { + SPDK_ERRLOG("Unable to start RPC service at %s\n", listen_addr); + return; + } + + spdk_rpc_set_state(SPDK_RPC_STARTUP); + + /* Register a poller to periodically check for RPCs */ + g_rpc_poller = spdk_poller_register(spdk_rpc_subsystem_poll, NULL, RPC_SELECT_INTERVAL); +} + +void +spdk_rpc_finish(void) +{ + spdk_rpc_close(); + spdk_poller_unregister(&g_rpc_poller); +} diff --git a/src/spdk/lib/event/rpc/Makefile b/src/spdk/lib/event/rpc/Makefile new file mode 100644 index 00000000..fcba526a --- /dev/null +++ b/src/spdk/lib/event/rpc/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = app_rpc.c subsystem_rpc.c +LIBNAME = app_rpc + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/rpc/app_rpc.c b/src/spdk/lib/event/rpc/app_rpc.c new file mode 100644 index 00000000..95cb0d2a --- /dev/null +++ b/src/spdk/lib/event/rpc/app_rpc.c @@ -0,0 +1,155 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/event.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_kill_instance { + char *sig_name; +}; + +static void +free_rpc_kill_instance(struct rpc_kill_instance *req) +{ + free(req->sig_name); +} + +static const struct spdk_json_object_decoder rpc_kill_instance_decoders[] = { + {"sig_name", offsetof(struct rpc_kill_instance, sig_name), spdk_json_decode_string}, +}; + +static void +spdk_rpc_kill_instance(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + static const struct { + const char *signal_string; + int32_t signal; + } signals[] = { + {"SIGINT", SIGINT}, + {"SIGTERM", SIGTERM}, + {"SIGQUIT", SIGQUIT}, + {"SIGHUP", SIGHUP}, + {"SIGKILL", SIGKILL}, + }; + size_t i, sig_count; + int signal; + struct rpc_kill_instance req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_kill_instance_decoders, + SPDK_COUNTOF(rpc_kill_instance_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "spdk_json_decode_object failed\n"); + goto invalid; + } + + sig_count = SPDK_COUNTOF(signals); + signal = atoi(req.sig_name); + for (i = 0 ; i < sig_count; i++) { + if (strcmp(req.sig_name, signals[i].signal_string) == 0 || + signal == signals[i].signal) { + break; + } + } + + if (i == sig_count) { + goto invalid; + } + + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "sending signal %d\n", signals[i].signal); + free_rpc_kill_instance(&req); + kill(getpid(), signals[i].signal); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_kill_instance(&req); +} +SPDK_RPC_REGISTER("kill_instance", spdk_rpc_kill_instance, SPDK_RPC_RUNTIME) + + +struct rpc_context_switch_monitor { + bool enabled; +}; + +static const struct spdk_json_object_decoder rpc_context_switch_monitor_decoders[] = { + {"enabled", offsetof(struct rpc_context_switch_monitor, enabled), spdk_json_decode_bool}, +}; + +static void +spdk_rpc_context_switch_monitor(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_context_switch_monitor req = {}; 
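+	/* Note: the "enabled" parameter is optional; when params are omitted this
+	 * RPC leaves the monitor setting untouched and only reports whether the
+	 * context switch monitor is currently enabled.
+	 */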
+ struct spdk_json_write_ctx *w; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_context_switch_monitor_decoders, + SPDK_COUNTOF(rpc_context_switch_monitor_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_REACTOR, "spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + return; + } + + spdk_reactor_enable_context_switch_monitor(req.enabled); + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "enabled"); + spdk_json_write_bool(w, spdk_reactor_context_switch_monitor_enabled()); + + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} + +SPDK_RPC_REGISTER("context_switch_monitor", spdk_rpc_context_switch_monitor, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/event/rpc/subsystem_rpc.c b/src/spdk/lib/event/rpc/subsystem_rpc.c new file mode 100644 index 00000000..1b83990f --- /dev/null +++ b/src/spdk/lib/event/rpc/subsystem_rpc.c @@ -0,0 +1,129 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk_internal/event.h" +#include "spdk/rpc.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/env.h" + +static void +spdk_rpc_get_subsystems(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_subsystem *subsystem; + struct spdk_subsystem_depend *deps; + + if (params) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "'get_subsystems' requires no arguments"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + TAILQ_FOREACH(subsystem, &g_subsystems, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "subsystem", subsystem->name); + spdk_json_write_named_array_begin(w, "depends_on"); + TAILQ_FOREACH(deps, &g_subsystems_deps, tailq) { + if (strcmp(subsystem->name, deps->name) == 0) { + spdk_json_write_string(w, deps->depends_on); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} + +SPDK_RPC_REGISTER("get_subsystems", spdk_rpc_get_subsystems, SPDK_RPC_RUNTIME) + +struct rpc_get_subsystem_config { + char *name; +}; + +static const struct spdk_json_object_decoder rpc_get_subsystem_config[] = { + {"name", offsetof(struct rpc_get_subsystem_config, name), spdk_json_decode_string}, +}; + +static void +rpc_get_subsystem_config_done(void *arg1, void *arg2) +{ + struct spdk_jsonrpc_request *request = arg1; + struct spdk_json_write_ctx *w = arg2; + + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_get_subsystem_config(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_subsystem_config req = {}; + struct spdk_json_write_ctx *w; + struct spdk_subsystem *subsystem; + struct spdk_event *ev; + + if (spdk_json_decode_object(params, rpc_get_subsystem_config, + SPDK_COUNTOF(rpc_get_subsystem_config), &req)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid arguments"); + return; + } + + subsystem = spdk_subsystem_find(&g_subsystems, req.name); + if (!subsystem) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Subsystem '%s' not found", req.name); + free(req.name); + return; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + if (w) { + ev = spdk_event_allocate(spdk_env_get_current_core(), rpc_get_subsystem_config_done, request, w); + spdk_subsystem_config_json(w, subsystem, ev); + } +} + +SPDK_RPC_REGISTER("get_subsystem_config", spdk_rpc_get_subsystem_config, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/event/subsystem.c b/src/spdk/lib/event/subsystem.c new file mode 100644 index 00000000..438e7f54 --- /dev/null +++ b/src/spdk/lib/event/subsystem.c @@ -0,0 +1,256 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" + +#include "spdk_internal/event.h" +#include "spdk/env.h" + +struct spdk_subsystem_list g_subsystems = TAILQ_HEAD_INITIALIZER(g_subsystems); +struct spdk_subsystem_depend_list g_subsystems_deps = TAILQ_HEAD_INITIALIZER(g_subsystems_deps); +static struct spdk_subsystem *g_next_subsystem; +static bool g_subsystems_initialized = false; +static struct spdk_event *g_app_start_event; +static struct spdk_event *g_app_stop_event; +static uint32_t g_fini_core; + +void +spdk_add_subsystem(struct spdk_subsystem *subsystem) +{ + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); +} + +void +spdk_add_subsystem_depend(struct spdk_subsystem_depend *depend) +{ + TAILQ_INSERT_TAIL(&g_subsystems_deps, depend, tailq); +} + +struct spdk_subsystem * +spdk_subsystem_find(struct spdk_subsystem_list *list, const char *name) +{ + struct spdk_subsystem *iter; + + TAILQ_FOREACH(iter, list, tailq) { + if (strcmp(name, iter->name) == 0) { + return iter; + } + } + + return NULL; +} + +static void +subsystem_sort(void) +{ + bool depends_on, depends_on_sorted; + struct spdk_subsystem *subsystem, *subsystem_tmp; + struct spdk_subsystem_depend *subsystem_dep; + + struct spdk_subsystem_list subsystems_list = TAILQ_HEAD_INITIALIZER(subsystems_list); + + while (!TAILQ_EMPTY(&g_subsystems)) { + TAILQ_FOREACH_SAFE(subsystem, &g_subsystems, tailq, subsystem_tmp) { + depends_on = false; + TAILQ_FOREACH(subsystem_dep, &g_subsystems_deps, tailq) { + if (strcmp(subsystem->name, subsystem_dep->name) == 0) { + depends_on = true; + depends_on_sorted = !!spdk_subsystem_find(&subsystems_list, subsystem_dep->depends_on); + if (depends_on_sorted) { + continue; + } + break; + } + } + + if (depends_on == false) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } else { + if (depends_on_sorted == true) { + TAILQ_REMOVE(&g_subsystems, subsystem, tailq); + TAILQ_INSERT_TAIL(&subsystems_list, subsystem, tailq); + } + } + } + } + + TAILQ_FOREACH_SAFE(subsystem, &subsystems_list, tailq, subsystem_tmp) { + TAILQ_REMOVE(&subsystems_list, subsystem, tailq); + TAILQ_INSERT_TAIL(&g_subsystems, subsystem, tailq); + } +} + +void +spdk_subsystem_init_next(int rc) +{ + if (rc) { + SPDK_ERRLOG("Init subsystem %s failed\n", g_next_subsystem->name); + spdk_app_stop(rc); + return; + } + + if (!g_next_subsystem) { + g_next_subsystem = TAILQ_FIRST(&g_subsystems); + } else { + g_next_subsystem = TAILQ_NEXT(g_next_subsystem, tailq); + } + + if (!g_next_subsystem) 
{ + g_subsystems_initialized = true; + spdk_event_call(g_app_start_event); + return; + } + + if (g_next_subsystem->init) { + g_next_subsystem->init(); + } else { + spdk_subsystem_init_next(0); + } +} + +static void +spdk_subsystem_verify(void *arg1, void *arg2) +{ + struct spdk_subsystem_depend *dep; + + /* Verify that all dependency name and depends_on subsystems are registered */ + TAILQ_FOREACH(dep, &g_subsystems_deps, tailq) { + if (!spdk_subsystem_find(&g_subsystems, dep->name)) { + SPDK_ERRLOG("subsystem %s is missing\n", dep->name); + spdk_app_stop(-1); + return; + } + if (!spdk_subsystem_find(&g_subsystems, dep->depends_on)) { + SPDK_ERRLOG("subsystem %s dependency %s is missing\n", + dep->name, dep->depends_on); + spdk_app_stop(-1); + return; + } + } + + subsystem_sort(); + + spdk_subsystem_init_next(0); +} + +void +spdk_subsystem_init(struct spdk_event *app_start_event) +{ + struct spdk_event *verify_event; + + g_app_start_event = app_start_event; + + verify_event = spdk_event_allocate(spdk_env_get_current_core(), spdk_subsystem_verify, NULL, NULL); + spdk_event_call(verify_event); +} + +static void +_spdk_subsystem_fini_next(void *arg1, void *arg2) +{ + assert(g_fini_core == spdk_env_get_current_core()); + + if (!g_next_subsystem) { + /* If the initialized flag is false, then we've failed to initialize + * the very first subsystem and no de-init is needed + */ + if (g_subsystems_initialized) { + g_next_subsystem = TAILQ_LAST(&g_subsystems, spdk_subsystem_list); + } + } else { + /* We rewind the g_next_subsystem unconditionally - even when some subsystem failed + * to initialize. It is assumed that subsystem which failed to initialize does not + * need to be deinitialized. + */ + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } + + while (g_next_subsystem) { + if (g_next_subsystem->fini) { + g_next_subsystem->fini(); + return; + } + g_next_subsystem = TAILQ_PREV(g_next_subsystem, spdk_subsystem_list, tailq); + } + + spdk_event_call(g_app_stop_event); + return; +} + +void +spdk_subsystem_fini_next(void) +{ + if (g_fini_core != spdk_env_get_current_core()) { + struct spdk_event *event; + + event = spdk_event_allocate(g_fini_core, _spdk_subsystem_fini_next, NULL, NULL); + spdk_event_call(event); + } else { + _spdk_subsystem_fini_next(NULL, NULL); + } +} + +void +spdk_subsystem_fini(struct spdk_event *app_stop_event) +{ + g_app_stop_event = app_stop_event; + g_fini_core = spdk_env_get_current_core(); + + spdk_subsystem_fini_next(); +} + +void +spdk_subsystem_config(FILE *fp) +{ + struct spdk_subsystem *subsystem; + + TAILQ_FOREACH(subsystem, &g_subsystems, tailq) { + if (subsystem->config) { + subsystem->config(fp); + } + } +} + +void +spdk_subsystem_config_json(struct spdk_json_write_ctx *w, struct spdk_subsystem *subsystem, + struct spdk_event *done_ev) +{ + if (subsystem && subsystem->write_config_json) { + subsystem->write_config_json(w, done_ev); + } else { + spdk_json_write_null(w); + spdk_event_call(done_ev); + } +} diff --git a/src/spdk/lib/event/subsystems/Makefile b/src/spdk/lib/event/subsystems/Makefile new file mode 100644 index 00000000..4a19160b --- /dev/null +++ b/src/spdk/lib/event/subsystems/Makefile @@ -0,0 +1,44 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +DIRS-y += bdev copy iscsi nbd net nvmf scsi vhost + +.PHONY: all clean $(DIRS-y) + +all: $(DIRS-y) +clean: $(DIRS-y) + +include $(SPDK_ROOT_DIR)/mk/spdk.subdirs.mk diff --git a/src/spdk/lib/event/subsystems/bdev/Makefile b/src/spdk/lib/event/subsystems/bdev/Makefile new file mode 100644 index 00000000..1747b759 --- /dev/null +++ b/src/spdk/lib/event/subsystems/bdev/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = bdev.c bdev_rpc.c +LIBNAME = event_bdev + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/bdev/bdev.c b/src/spdk/lib/event/subsystems/bdev/bdev.c new file mode 100644 index 00000000..5999d612 --- /dev/null +++ b/src/spdk/lib/event/subsystems/bdev/bdev.c @@ -0,0 +1,83 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/env.h" +#include "spdk/thread.h" + +#include "spdk_internal/event.h" +#include "spdk/env.h" + +static void +spdk_bdev_initialize_complete(void *cb_arg, int rc) +{ + spdk_subsystem_init_next(rc); +} + +static void +spdk_bdev_subsystem_initialize(void) +{ + spdk_bdev_initialize(spdk_bdev_initialize_complete, NULL); +} + +static void +spdk_bdev_subsystem_finish_done(void *cb_arg) +{ + spdk_subsystem_fini_next(); +} + +static void +spdk_bdev_subsystem_finish(void) +{ + spdk_bdev_finish(spdk_bdev_subsystem_finish_done, NULL); +} + +static void +_spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev) +{ + spdk_bdev_subsystem_config_json(w); + spdk_event_call(done_ev); +} + +static struct spdk_subsystem g_spdk_subsystem_bdev = { + .name = "bdev", + .init = spdk_bdev_subsystem_initialize, + .fini = spdk_bdev_subsystem_finish, + .config = spdk_bdev_config_text, + .write_config_json = _spdk_bdev_subsystem_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_bdev); +SPDK_SUBSYSTEM_DEPEND(bdev, copy) diff --git a/src/spdk/lib/event/subsystems/bdev/bdev_rpc.c b/src/spdk/lib/event/subsystems/bdev/bdev_rpc.c new file mode 100644 index 00000000..69ead5f2 --- /dev/null +++ b/src/spdk/lib/event/subsystems/bdev/bdev_rpc.c @@ -0,0 +1,97 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/bdev.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +struct spdk_rpc_set_bdev_opts { + uint32_t bdev_io_pool_size; + uint32_t bdev_io_cache_size; +}; + +static const struct spdk_json_object_decoder rpc_set_bdev_opts_decoders[] = { + {"bdev_io_pool_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_pool_size), spdk_json_decode_uint32, true}, + {"bdev_io_cache_size", offsetof(struct spdk_rpc_set_bdev_opts, bdev_io_cache_size), spdk_json_decode_uint32, true}, +}; + +static void +spdk_rpc_set_bdev_opts(struct spdk_jsonrpc_request *request, const struct spdk_json_val *params) +{ + struct spdk_rpc_set_bdev_opts rpc_opts; + struct spdk_bdev_opts bdev_opts; + struct spdk_json_write_ctx *w; + int rc; + + rpc_opts.bdev_io_pool_size = UINT32_MAX; + rpc_opts.bdev_io_cache_size = UINT32_MAX; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_bdev_opts_decoders, + SPDK_COUNTOF(rpc_set_bdev_opts_decoders), &rpc_opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + spdk_bdev_get_opts(&bdev_opts); + if (rpc_opts.bdev_io_pool_size != UINT32_MAX) { + bdev_opts.bdev_io_pool_size = rpc_opts.bdev_io_pool_size; + } + if (rpc_opts.bdev_io_cache_size != UINT32_MAX) { + bdev_opts.bdev_io_cache_size = rpc_opts.bdev_io_cache_size; + } + rc = spdk_bdev_set_opts(&bdev_opts); + + if (rc != 0) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Pool size %" PRIu32 " too small for cache size %" PRIu32, + bdev_opts.bdev_io_pool_size, bdev_opts.bdev_io_cache_size); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_bdev_options", spdk_rpc_set_bdev_opts, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/event/subsystems/copy/Makefile b/src/spdk/lib/event/subsystems/copy/Makefile new file mode 100644 index 00000000..691eee2a --- /dev/null +++ b/src/spdk/lib/event/subsystems/copy/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = copy.c +LIBNAME = event_copy + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/copy/copy.c b/src/spdk/lib/event/subsystems/copy/copy.c new file mode 100644 index 00000000..9bc6e281 --- /dev/null +++ b/src/spdk/lib/event/subsystems/copy/copy.c @@ -0,0 +1,70 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/copy_engine.h" + +#include "spdk_internal/event.h" +#include "spdk/env.h" + +static void +spdk_copy_engine_subsystem_initialize(void) +{ + int rc; + + rc = spdk_copy_engine_initialize(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_copy_engine_subsystem_finish_done(void *cb_arg) +{ + spdk_subsystem_fini_next(); +} + +static void +spdk_copy_engine_subsystem_finish(void) +{ + spdk_copy_engine_finish(spdk_copy_engine_subsystem_finish_done, NULL); +} + +static struct spdk_subsystem g_spdk_subsystem_copy = { + .name = "copy", + .init = spdk_copy_engine_subsystem_initialize, + .fini = spdk_copy_engine_subsystem_finish, + .config = spdk_copy_engine_config_text, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_copy); diff --git a/src/spdk/lib/event/subsystems/iscsi/Makefile b/src/spdk/lib/event/subsystems/iscsi/Makefile new file mode 100644 index 00000000..f57d9f9c --- /dev/null +++ b/src/spdk/lib/event/subsystems/iscsi/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib +C_SRCS = iscsi.c iscsi_rpc.c +LIBNAME = event_iscsi + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/iscsi/iscsi.c b/src/spdk/lib/event/subsystems/iscsi/iscsi.c new file mode 100644 index 00000000..72750398 --- /dev/null +++ b/src/spdk/lib/event/subsystems/iscsi/iscsi.c @@ -0,0 +1,81 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" + +#include "spdk_internal/event.h" + +static void +spdk_iscsi_subsystem_init_complete(void *cb_arg, int rc) +{ + spdk_subsystem_init_next(rc); +} + +static void +spdk_iscsi_subsystem_init(void) +{ + spdk_iscsi_init(spdk_iscsi_subsystem_init_complete, NULL); +} + +static void +spdk_iscsi_subsystem_fini_done(void *arg) +{ + spdk_subsystem_fini_next(); +} + +static void +spdk_iscsi_subsystem_fini(void) +{ + spdk_iscsi_fini(spdk_iscsi_subsystem_fini_done, NULL); +} + +static void +spdk_iscsi_subsystem_config_json(struct spdk_json_write_ctx *w, + struct spdk_event *done_ev) +{ + spdk_iscsi_config_json(w); + spdk_event_call(done_ev); +} + +static struct spdk_subsystem g_spdk_subsystem_iscsi = { + .name = "iscsi", + .init = spdk_iscsi_subsystem_init, + .fini = spdk_iscsi_subsystem_fini, + .config = spdk_iscsi_config_text, + .write_config_json = spdk_iscsi_subsystem_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_iscsi); +SPDK_SUBSYSTEM_DEPEND(iscsi, scsi) diff --git a/src/spdk/lib/event/subsystems/iscsi/iscsi_rpc.c b/src/spdk/lib/event/subsystems/iscsi/iscsi_rpc.c new file mode 100644 index 00000000..fb96be07 --- /dev/null +++ b/src/spdk/lib/event/subsystems/iscsi/iscsi_rpc.c @@ -0,0 +1,119 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/event.h" + +#include "spdk_internal/log.h" + +static const struct spdk_json_object_decoder rpc_set_iscsi_opts_decoders[] = { + {"auth_file", offsetof(struct spdk_iscsi_opts, authfile), spdk_json_decode_string, true}, + {"node_base", offsetof(struct spdk_iscsi_opts, nodebase), spdk_json_decode_string, true}, + {"nop_timeout", offsetof(struct spdk_iscsi_opts, timeout), spdk_json_decode_int32, true}, + {"nop_in_interval", offsetof(struct spdk_iscsi_opts, nopininterval), spdk_json_decode_int32, true}, + {"no_discovery_auth", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"req_discovery_auth_mutual", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"discovery_auth_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"disable_chap", offsetof(struct spdk_iscsi_opts, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct spdk_iscsi_opts, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct spdk_iscsi_opts, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct spdk_iscsi_opts, chap_group), spdk_json_decode_int32, true}, + {"max_sessions", offsetof(struct spdk_iscsi_opts, MaxSessions), spdk_json_decode_uint32, true}, + {"max_queue_depth", offsetof(struct spdk_iscsi_opts, MaxQueueDepth), spdk_json_decode_uint32, true}, + {"max_connections_per_session", offsetof(struct spdk_iscsi_opts, MaxConnectionsPerSession), spdk_json_decode_uint32, true}, + {"default_time2wait", offsetof(struct spdk_iscsi_opts, DefaultTime2Wait), spdk_json_decode_uint32, true}, + {"default_time2retain", offsetof(struct spdk_iscsi_opts, DefaultTime2Retain), spdk_json_decode_uint32, true}, + {"first_burst_length", offsetof(struct spdk_iscsi_opts, FirstBurstLength), spdk_json_decode_uint32, true}, + {"immediate_data", offsetof(struct spdk_iscsi_opts, ImmediateData), spdk_json_decode_bool, true}, + {"error_recovery_level", offsetof(struct spdk_iscsi_opts, ErrorRecoveryLevel), spdk_json_decode_uint32, true}, + {"allow_duplicated_isid", offsetof(struct spdk_iscsi_opts, AllowDuplicateIsid), spdk_json_decode_bool, true}, + {"min_connections_per_core", offsetof(struct spdk_iscsi_opts, min_connections_per_core), spdk_json_decode_uint32, true}, +}; + +static void +spdk_rpc_iscsi_set_opts(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_iscsi_opts *opts; + struct spdk_json_write_ctx *w; + + if (g_spdk_iscsi_opts != 
NULL) { + SPDK_ERRLOG("this RPC must not be called more than once.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Must not call more than once"); + return; + } + + opts = spdk_iscsi_opts_alloc(); + if (opts == NULL) { + SPDK_ERRLOG("spdk_iscsi_opts_alloc() failed.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_set_iscsi_opts_decoders, + SPDK_COUNTOF(rpc_set_iscsi_opts_decoders), opts)) { + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + spdk_iscsi_opts_free(opts); + return; + } + } + + g_spdk_iscsi_opts = spdk_iscsi_opts_copy(opts); + spdk_iscsi_opts_free(opts); + + if (g_spdk_iscsi_opts == NULL) { + SPDK_ERRLOG("spdk_iscsi_opts_copy() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_iscsi_options", spdk_rpc_iscsi_set_opts, SPDK_RPC_STARTUP) diff --git a/src/spdk/lib/event/subsystems/nbd/Makefile b/src/spdk/lib/event/subsystems/nbd/Makefile new file mode 100644 index 00000000..92d99f15 --- /dev/null +++ b/src/spdk/lib/event/subsystems/nbd/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) 
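+# Thin event-framework glue for the NBD subsystem (nbd.c only).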
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = nbd.c +LIBNAME = event_nbd + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/nbd/nbd.c b/src/spdk/lib/event/subsystems/nbd/nbd.c new file mode 100644 index 00000000..a943eb82 --- /dev/null +++ b/src/spdk/lib/event/subsystems/nbd/nbd.c @@ -0,0 +1,74 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/nbd.h" + +#include "spdk_internal/event.h" + +static void +spdk_nbd_subsystem_init(void) +{ + int rc; + + rc = spdk_nbd_init(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_nbd_subsystem_fini(void) +{ + spdk_nbd_fini(); + spdk_subsystem_fini_next(); +} + +static void +spdk_nbd_subsystem_write_config_json(struct spdk_json_write_ctx *w, + struct spdk_event *done_ev) +{ + spdk_nbd_write_config_json(w); + spdk_event_call(done_ev); +} + +static struct spdk_subsystem g_spdk_subsystem_nbd = { + .name = "nbd", + .init = spdk_nbd_subsystem_init, + .fini = spdk_nbd_subsystem_fini, + .config = NULL, + .write_config_json = spdk_nbd_subsystem_write_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nbd); +SPDK_SUBSYSTEM_DEPEND(nbd, bdev) diff --git a/src/spdk/lib/event/subsystems/net/Makefile b/src/spdk/lib/event/subsystems/net/Makefile new file mode 100644 index 00000000..cf81f07b --- /dev/null +++ b/src/spdk/lib/event/subsystems/net/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = net.c +LIBNAME = event_net + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/net/net.c b/src/spdk/lib/event/subsystems/net/net.c new file mode 100644 index 00000000..9355514f --- /dev/null +++ b/src/spdk/lib/event/subsystems/net/net.c @@ -0,0 +1,91 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/net.h" + +#include "spdk_internal/event.h" + +static void +spdk_interface_subsystem_init(void) +{ + int rc; + + rc = spdk_interface_init(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_interface_subsystem_destroy(void) +{ + spdk_interface_destroy(); + spdk_subsystem_fini_next(); +} + +static struct spdk_subsystem g_spdk_subsystem_interface = { + .name = "interface", + .init = spdk_interface_subsystem_init, + .fini = spdk_interface_subsystem_destroy, + .config = NULL, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_interface); + +static void +spdk_net_subsystem_start(void) +{ + int rc; + + rc = spdk_net_framework_start(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_net_subsystem_fini(void) +{ + spdk_net_framework_fini(); + spdk_subsystem_fini_next(); +} + +static struct spdk_subsystem g_spdk_subsystem_net_framework = { + .name = "net_framework", + .init = spdk_net_subsystem_start, + .fini = spdk_net_subsystem_fini, + .config = NULL, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_net_framework); +SPDK_SUBSYSTEM_DEPEND(net_framework, interface) diff --git a/src/spdk/lib/event/subsystems/nvmf/Makefile b/src/spdk/lib/event/subsystems/nvmf/Makefile new file mode 100644 index 00000000..eca62e25 --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = conf.c nvmf_rpc.c nvmf_rpc_deprecated.c nvmf_tgt.c +LIBNAME = event_nvmf + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/nvmf/conf.c b/src/spdk/lib/event/subsystems/nvmf/conf.c new file mode 100644 index 00000000..986e81c9 --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/conf.c @@ -0,0 +1,587 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "event_nvmf.h" + +#include "spdk/conf.h" +#include "spdk/log.h" +#include "spdk/bdev.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#define SPDK_NVMF_MAX_NAMESPACES (1 << 14) + +struct spdk_nvmf_tgt_opts *g_spdk_nvmf_tgt_opts = NULL; +struct spdk_nvmf_tgt_conf *g_spdk_nvmf_tgt_conf = NULL; + +static int +spdk_add_nvmf_discovery_subsystem(void) +{ + struct spdk_nvmf_subsystem *subsystem; + + subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, SPDK_NVMF_DISCOVERY_NQN, + SPDK_NVMF_SUBTYPE_DISCOVERY, 0); + if (subsystem == NULL) { + SPDK_ERRLOG("Failed creating discovery nvmf library subsystem\n"); + return -1; + } + + spdk_nvmf_subsystem_set_allow_any_host(subsystem, true); + + return 0; +} + +static void +spdk_nvmf_read_config_file_tgt_opts(struct spdk_conf_section *sp, + struct spdk_nvmf_tgt_opts *opts) +{ + int max_queue_depth; + int max_queues_per_sess; + int in_capsule_data_size; + int max_io_size; + int io_unit_size; + + max_queue_depth = spdk_conf_section_get_intval(sp, "MaxQueueDepth"); + if (max_queue_depth >= 0) { + opts->max_queue_depth = max_queue_depth; + } + + max_queues_per_sess = spdk_conf_section_get_intval(sp, "MaxQueuesPerSession"); + if (max_queues_per_sess >= 0) { + opts->max_qpairs_per_ctrlr = max_queues_per_sess; + } + + in_capsule_data_size = spdk_conf_section_get_intval(sp, "InCapsuleDataSize"); + if (in_capsule_data_size >= 0) { + opts->in_capsule_data_size = in_capsule_data_size; + } + + max_io_size = spdk_conf_section_get_intval(sp, "MaxIOSize"); + if (max_io_size >= 0) { + opts->max_io_size = max_io_size; + } + + io_unit_size = spdk_conf_section_get_intval(sp, "IOUnitSize"); + if (io_unit_size >= 0) { + opts->io_unit_size = io_unit_size; + } +} + +static void +spdk_nvmf_read_config_file_tgt_conf(struct spdk_conf_section *sp, + struct spdk_nvmf_tgt_conf *conf) +{ + int acceptor_poll_rate; + + acceptor_poll_rate = spdk_conf_section_get_intval(sp, "AcceptorPollRate"); + if (acceptor_poll_rate >= 0) 
{ + conf->acceptor_poll_rate = acceptor_poll_rate; + } +} + +static struct spdk_nvmf_tgt_opts * +spdk_nvmf_parse_tgt_opts(void) +{ + struct spdk_nvmf_tgt_opts *opts; + struct spdk_conf_section *sp; + + opts = calloc(1, sizeof(*opts)); + if (!opts) { + SPDK_ERRLOG("calloc() failed for target options\n"); + return NULL; + } + + spdk_nvmf_tgt_opts_init(opts); + + sp = spdk_conf_find_section(NULL, "Nvmf"); + if (sp != NULL) { + spdk_nvmf_read_config_file_tgt_opts(sp, opts); + } + + return opts; +} + +static struct spdk_nvmf_tgt_conf * +spdk_nvmf_parse_tgt_conf(void) +{ + struct spdk_nvmf_tgt_conf *conf; + struct spdk_conf_section *sp; + + conf = calloc(1, sizeof(*conf)); + if (!conf) { + SPDK_ERRLOG("calloc() failed for target conf\n"); + return NULL; + } + + conf->acceptor_poll_rate = ACCEPT_TIMEOUT_US; + conf->conn_sched = DEFAULT_CONN_SCHED; + + sp = spdk_conf_find_section(NULL, "Nvmf"); + if (sp != NULL) { + spdk_nvmf_read_config_file_tgt_conf(sp, conf); + } + + return conf; +} + +static int +spdk_nvmf_parse_nvmf_tgt(void) +{ + int rc; + + if (!g_spdk_nvmf_tgt_opts) { + g_spdk_nvmf_tgt_opts = spdk_nvmf_parse_tgt_opts(); + if (!g_spdk_nvmf_tgt_opts) { + SPDK_ERRLOG("spdk_nvmf_parse_tgt_opts() failed\n"); + return -1; + } + } + + if (!g_spdk_nvmf_tgt_conf) { + g_spdk_nvmf_tgt_conf = spdk_nvmf_parse_tgt_conf(); + if (!g_spdk_nvmf_tgt_conf) { + SPDK_ERRLOG("spdk_nvmf_parse_tgt_conf() failed\n"); + return -1; + } + } + + g_spdk_nvmf_tgt = spdk_nvmf_tgt_create(g_spdk_nvmf_tgt_opts); + + free(g_spdk_nvmf_tgt_opts); + g_spdk_nvmf_tgt_opts = NULL; + + if (!g_spdk_nvmf_tgt) { + SPDK_ERRLOG("spdk_nvmf_tgt_create() failed\n"); + return -1; + } + + rc = spdk_add_nvmf_discovery_subsystem(); + if (rc != 0) { + SPDK_ERRLOG("spdk_add_nvmf_discovery_subsystem failed\n"); + return rc; + } + + return 0; +} + +static void +spdk_nvmf_tgt_listen_done(void *cb_arg, int status) +{ + /* TODO: Config parsing should wait for this operation to finish. */ + + if (status) { + SPDK_ERRLOG("Failed to listen on transport address\n"); + } +} + +static int +spdk_nvmf_parse_subsystem(struct spdk_conf_section *sp) +{ + const char *nqn, *mode; + size_t i; + int ret; + int lcore; + bool allow_any_host; + const char *sn; + struct spdk_nvmf_subsystem *subsystem; + int num_ns; + + nqn = spdk_conf_section_get_val(sp, "NQN"); + if (nqn == NULL) { + SPDK_ERRLOG("Subsystem missing NQN\n"); + return -1; + } + + mode = spdk_conf_section_get_val(sp, "Mode"); + lcore = spdk_conf_section_get_intval(sp, "Core"); + num_ns = spdk_conf_section_get_intval(sp, "MaxNamespaces"); + + if (num_ns < 1) { + num_ns = 0; + } else if (num_ns > SPDK_NVMF_MAX_NAMESPACES) { + num_ns = SPDK_NVMF_MAX_NAMESPACES; + } + + /* Mode is no longer a valid parameter, but print out a nice + * message if it exists to inform users. + */ + if (mode) { + SPDK_NOTICELOG("Mode present in the [Subsystem] section of the config file.\n" + "Mode was removed as a valid parameter.\n"); + if (strcasecmp(mode, "Virtual") == 0) { + SPDK_NOTICELOG("Your mode value is 'Virtual' which is now the only possible mode.\n" + "Your configuration file will work as expected.\n"); + } else { + SPDK_NOTICELOG("Please remove Mode from your configuration file.\n"); + return -1; + } + } + + /* Core is no longer a valid parameter, but print out a nice + * message if it exists to inform users. + */ + if (lcore >= 0) { + SPDK_NOTICELOG("Core present in the [Subsystem] section of the config file.\n" + "Core was removed as an option. 
Subsystems can now run on all available cores.\n"); + SPDK_NOTICELOG("Please remove Core from your configuration file. Ignoring it and continuing.\n"); + } + + sn = spdk_conf_section_get_val(sp, "SN"); + if (sn == NULL) { + SPDK_ERRLOG("Subsystem %s: missing serial number\n", nqn); + return -1; + } + + subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, nqn, SPDK_NVMF_SUBTYPE_NVME, num_ns); + if (subsystem == NULL) { + goto done; + } + + if (spdk_nvmf_subsystem_set_sn(subsystem, sn)) { + SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", nqn, sn); + spdk_nvmf_subsystem_destroy(subsystem); + subsystem = NULL; + goto done; + } + + for (i = 0; ; i++) { + struct spdk_nvmf_ns_opts ns_opts; + struct spdk_bdev *bdev; + const char *bdev_name; + const char *uuid_str; + char *nsid_str; + + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + break; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("Could not find namespace bdev '%s'\n", bdev_name); + spdk_nvmf_subsystem_destroy(subsystem); + subsystem = NULL; + goto done; + } + + spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts)); + + nsid_str = spdk_conf_section_get_nmval(sp, "Namespace", i, 1); + if (nsid_str) { + char *end; + unsigned long nsid_ul = strtoul(nsid_str, &end, 0); + + if (*end != '\0' || nsid_ul == 0 || nsid_ul >= UINT32_MAX) { + SPDK_ERRLOG("Invalid NSID %s\n", nsid_str); + spdk_nvmf_subsystem_destroy(subsystem); + subsystem = NULL; + goto done; + } + + ns_opts.nsid = (uint32_t)nsid_ul; + } + + uuid_str = spdk_conf_section_get_nmval(sp, "Namespace", i, 2); + if (uuid_str) { + if (spdk_uuid_parse(&ns_opts.uuid, uuid_str)) { + SPDK_ERRLOG("Invalid UUID %s\n", uuid_str); + spdk_nvmf_subsystem_destroy(subsystem); + subsystem = NULL; + goto done; + } + } + + if (spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts)) == 0) { + SPDK_ERRLOG("Unable to add namespace\n"); + spdk_nvmf_subsystem_destroy(subsystem); + subsystem = NULL; + goto done; + } + + SPDK_INFOLOG(SPDK_LOG_NVMF, "Attaching block device %s to subsystem %s\n", + spdk_bdev_get_name(bdev), spdk_nvmf_subsystem_get_nqn(subsystem)); + } + + /* Parse Listen sections */ + for (i = 0; ; i++) { + struct spdk_nvme_transport_id trid = {0}; + const char *transport; + const char *address; + char *address_dup; + char *host; + char *port; + + transport = spdk_conf_section_get_nmval(sp, "Listen", i, 0); + if (!transport) { + break; + } + + if (spdk_nvme_transport_id_parse_trtype(&trid.trtype, transport)) { + SPDK_ERRLOG("Invalid listen address transport type '%s'\n", transport); + continue; + } + + address = spdk_conf_section_get_nmval(sp, "Listen", i, 1); + if (!address) { + break; + } + + address_dup = strdup(address); + if (!address_dup) { + break; + } + + ret = spdk_parse_ip_addr(address_dup, &host, &port); + if (ret < 0) { + SPDK_ERRLOG("Unable to parse listen address '%s'\n", address); + free(address_dup); + continue; + } + + if (strchr(host, ':')) { + trid.adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else { + trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; + } + + snprintf(trid.traddr, sizeof(trid.traddr), "%s", host); + if (port) { + snprintf(trid.trsvcid, sizeof(trid.trsvcid), "%s", port); + } + free(address_dup); + + spdk_nvmf_tgt_listen(g_spdk_nvmf_tgt, &trid, spdk_nvmf_tgt_listen_done, NULL); + + spdk_nvmf_subsystem_add_listener(subsystem, &trid); + } + + /* Parse Host sections */ + for (i = 0; ; i++) { + const char *host = spdk_conf_section_get_nval(sp, "Host", i); + + if (!host) { + break; + } + + 
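+		/*
+		 * Illustrative only: each "Host" entry in a [Subsystem] section names
+		 * one initiator NQN that is allowed to connect to this subsystem, e.g.
+		 *
+		 *   Host nqn.2014-08.org.nvmexpress:uuid:8b43a1de-0000-0000-0000-000000000001
+		 *
+		 * The NQN above is an example value, not taken from this patch.
+		 */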
spdk_nvmf_subsystem_add_host(subsystem, host); + } + + allow_any_host = spdk_conf_section_get_boolval(sp, "AllowAnyHost", false); + spdk_nvmf_subsystem_set_allow_any_host(subsystem, allow_any_host); + +done: + return (subsystem != NULL); +} + +static int +spdk_nvmf_parse_subsystems(void) +{ + int rc = 0; + struct spdk_conf_section *sp; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "Subsystem")) { + rc = spdk_nvmf_parse_subsystem(sp); + if (rc < 0) { + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +struct spdk_nvmf_parse_transport_ctx { + struct spdk_conf_section *sp; + spdk_nvmf_parse_conf_done_fn cb_fn; +}; + +static void spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx); + +static void +spdk_nvmf_tgt_add_transport_done(void *cb_arg, int status) +{ + struct spdk_nvmf_parse_transport_ctx *ctx = cb_arg; + int rc; + + if (status < 0) { + SPDK_ERRLOG("Add transport to target failed (%d).\n", status); + ctx->cb_fn(status); + free(ctx); + return; + } + + /* find next transport */ + ctx->sp = spdk_conf_next_section(ctx->sp); + while (ctx->sp) { + if (spdk_conf_section_match_prefix(ctx->sp, "Transport")) { + spdk_nvmf_parse_transport(ctx); + return; + } + ctx->sp = spdk_conf_next_section(ctx->sp); + } + + /* done with transports, parse Subsystem sections */ + rc = spdk_nvmf_parse_subsystems(); + + ctx->cb_fn(rc); + free(ctx); +} + +static void +spdk_nvmf_parse_transport(struct spdk_nvmf_parse_transport_ctx *ctx) +{ + const char *type; + struct spdk_nvmf_transport_opts opts = { 0 }; + enum spdk_nvme_transport_type trtype; + struct spdk_nvmf_transport *transport; + int val; + + type = spdk_conf_section_get_val(ctx->sp, "Type"); + if (type == NULL) { + SPDK_ERRLOG("Transport missing Type\n"); + ctx->cb_fn(-1); + free(ctx); + return; + } + + if (spdk_nvme_transport_id_parse_trtype(&trtype, type)) { + SPDK_ERRLOG("Invalid transport type '%s'\n", type); + ctx->cb_fn(-1); + free(ctx); + return; + } + + if (spdk_nvmf_tgt_get_transport(g_spdk_nvmf_tgt, trtype)) { + SPDK_ERRLOG("Duplicate transport type '%s'\n", type); + ctx->cb_fn(-1); + free(ctx); + return; + } + + if (!spdk_nvmf_transport_opts_init(trtype, &opts)) { + ctx->cb_fn(-1); + free(ctx); + return; + } + + val = spdk_conf_section_get_intval(ctx->sp, "MaxQueueDepth"); + if (val >= 0) { + opts.max_queue_depth = val; + } + val = spdk_conf_section_get_intval(ctx->sp, "MaxQueuesPerSession"); + if (val >= 0) { + opts.max_qpairs_per_ctrlr = val; + } + val = spdk_conf_section_get_intval(ctx->sp, "InCapsuleDataSize"); + if (val >= 0) { + opts.in_capsule_data_size = val; + } + val = spdk_conf_section_get_intval(ctx->sp, "MaxIOSize"); + if (val >= 0) { + opts.max_io_size = val; + } + val = spdk_conf_section_get_intval(ctx->sp, "IOUnitSize"); + if (val >= 0) { + opts.io_unit_size = val; + } + val = spdk_conf_section_get_intval(ctx->sp, "MaxAQDepth"); + if (val >= 0) { + opts.max_aq_depth = val; + } + + transport = spdk_nvmf_transport_create(trtype, &opts); + if (transport) { + spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, spdk_nvmf_tgt_add_transport_done, ctx); + } else { + ctx->cb_fn(-1); + free(ctx); + return; + } +} + +static int +spdk_nvmf_parse_transports(spdk_nvmf_parse_conf_done_fn cb_fn) +{ + struct spdk_nvmf_parse_transport_ctx *ctx; + + ctx = calloc(1, sizeof(struct spdk_nvmf_parse_transport_ctx)); + if (!ctx) { + SPDK_ERRLOG("Failed alloc of context memory for parse transports\n"); + return -ENOMEM; + } + + ctx->cb_fn = cb_fn; + 
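+	/*
+	 * The loop below scans the config file for [Transport] sections. A minimal
+	 * illustrative section (the keys match what spdk_nvmf_parse_transport()
+	 * reads; the values are examples only):
+	 *
+	 *   [Transport]
+	 *     Type RDMA
+	 *     MaxQueueDepth 128
+	 *     InCapsuleDataSize 4096
+	 *     MaxIOSize 131072
+	 *     IOUnitSize 131072
+	 *     MaxAQDepth 32
+	 */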
ctx->sp = spdk_conf_first_section(NULL); + while (ctx->sp != NULL) { + if (spdk_conf_section_match_prefix(ctx->sp, "Transport")) { + spdk_nvmf_parse_transport(ctx); + return 0; + } + ctx->sp = spdk_conf_next_section(ctx->sp); + } + + /* if we get here, there are no transports defined in conf file */ + free(ctx); + cb_fn(spdk_nvmf_parse_subsystems()); + + return 0; +} + +int +spdk_nvmf_parse_conf(spdk_nvmf_parse_conf_done_fn cb_fn) +{ + int rc; + + if (cb_fn == NULL) { + SPDK_ERRLOG("Callback function is NULL\n"); + return -1; + } + + /* NVMf section */ + rc = spdk_nvmf_parse_nvmf_tgt(); + if (rc < 0) { + return rc; + } + + /* Transport sections */ + rc = spdk_nvmf_parse_transports(cb_fn); + if (rc < 0) { + return rc; + } + + return 0; +} diff --git a/src/spdk/lib/event/subsystems/nvmf/event_nvmf.h b/src/spdk/lib/event/subsystems/nvmf/event_nvmf.h new file mode 100644 index 00000000..50e5d755 --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/event_nvmf.h @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef NVMF_TGT_H +#define NVMF_TGT_H + +#include "spdk/stdinc.h" + +#include "spdk/nvmf.h" +#include "spdk/queue.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +#define ACCEPT_TIMEOUT_US 10000 /* 10ms */ +#define DEFAULT_CONN_SCHED CONNECT_SCHED_ROUND_ROBIN + +enum spdk_nvmf_connect_sched { + CONNECT_SCHED_ROUND_ROBIN = 0, + CONNECT_SCHED_HOST_IP, +}; + +struct spdk_nvmf_tgt_conf { + uint32_t acceptor_poll_rate; + enum spdk_nvmf_connect_sched conn_sched; +}; + +extern struct spdk_nvmf_tgt_opts *g_spdk_nvmf_tgt_opts; +extern struct spdk_nvmf_tgt_conf *g_spdk_nvmf_tgt_conf; + +extern struct spdk_nvmf_tgt *g_spdk_nvmf_tgt; + +typedef void (*spdk_nvmf_parse_conf_done_fn)(int status); + +int spdk_nvmf_parse_conf(spdk_nvmf_parse_conf_done_fn cb_fn); + +#endif diff --git a/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc.c b/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc.c new file mode 100644 index 00000000..e4114afe --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc.c @@ -0,0 +1,1562 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "event_nvmf.h" + +#include "spdk/bdev.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/string.h" +#include "spdk/util.h" + +static int +json_write_hex_str(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + static const char hex_char[16] = "0123456789ABCDEF"; + const uint8_t *buf = data; + char *str, *out; + int rc; + + str = malloc(size * 2 + 1); + if (str == NULL) { + return -1; + } + + out = str; + while (size--) { + unsigned byte = *buf++; + + out[0] = hex_char[(byte >> 4) & 0xF]; + out[1] = hex_char[byte & 0xF]; + + out += 2; + } + *out = '\0'; + + rc = spdk_json_write_string(w, str); + free(str); + + return rc; +} + +static int +hex_nybble_to_num(char c) +{ + if (c >= '0' && c <= '9') { + return c - '0'; + } + + if (c >= 'a' && c <= 'f') { + return c - 'a' + 0xA; + } + + if (c >= 'A' && c <= 'F') { + return c - 'A' + 0xA; + } + + return -1; +} + +static int +hex_byte_to_num(const char *str) +{ + int hi, lo; + + hi = hex_nybble_to_num(str[0]); + if (hi < 0) { + return hi; + } + + lo = hex_nybble_to_num(str[1]); + if (lo < 0) { + return lo; + } + + return hi * 16 + lo; +} + +static int +decode_hex_string_be(const char *str, uint8_t *out, size_t size) +{ + size_t i; + + /* Decode a string in "ABCDEF012345" format to its binary representation */ + for (i = 0; i < size; i++) { + int num = hex_byte_to_num(str); + + if (num < 0) { + /* Invalid hex byte or end of string */ + return -1; + } + + out[i] = (uint8_t)num; + str += 2; + } + + if (i != size || *str != '\0') { + /* Length mismatch */ + return -1; + } + + return 0; +} + +static int +decode_ns_nguid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 16-byte NGUID */ + rc = decode_hex_string_be(str, out, 16); + } + + free(str); + return rc; +} + +static int +decode_ns_eui64(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 8-byte EUI-64 */ + rc = decode_hex_string_be(str, out, 8); + } + + free(str); + return rc; +} + +static int +decode_ns_uuid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + rc = spdk_uuid_parse(out, str); + } + + free(str); + return rc; +} + +static void +dump_nvmf_subsystem(struct spdk_json_write_ctx *w, struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_listener *listener; + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "nqn"); + spdk_json_write_string(w, spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_name(w, "subtype"); + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + spdk_json_write_string(w, "NVMe"); + } else { + spdk_json_write_string(w, "Discovery"); + } + + spdk_json_write_name(w, "listen_addresses"); + spdk_json_write_array_begin(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + const struct spdk_nvme_transport_id *trid; + const char *trtype; + const char *adrfam; + + trid = spdk_nvmf_listener_get_trid(listener); + + spdk_json_write_object_begin(w); + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + if (trtype == NULL) { + trtype = "unknown"; + } + adrfam = 
spdk_nvme_transport_id_adrfam_str(trid->adrfam); + if (adrfam == NULL) { + adrfam = "unknown"; + } + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + spdk_json_write_name(w, "transport"); + spdk_json_write_string(w, trtype); + spdk_json_write_name(w, "trtype"); + spdk_json_write_string(w, trtype); + spdk_json_write_name(w, "adrfam"); + spdk_json_write_string(w, adrfam); + spdk_json_write_name(w, "traddr"); + spdk_json_write_string(w, trid->traddr); + spdk_json_write_name(w, "trsvcid"); + spdk_json_write_string(w, trid->trsvcid); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_name(w, "allow_any_host"); + spdk_json_write_bool(w, spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + + spdk_json_write_name(w, "hosts"); + spdk_json_write_array_begin(w); + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "nqn"); + spdk_json_write_string(w, spdk_nvmf_host_get_nqn(host)); + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + if (spdk_nvmf_subsystem_get_type(subsystem) == SPDK_NVMF_SUBTYPE_NVME) { + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + + spdk_json_write_name(w, "serial_number"); + spdk_json_write_string(w, spdk_nvmf_subsystem_get_sn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + spdk_json_write_name(w, "namespaces"); + spdk_json_write_array_begin(w); + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "nsid"); + spdk_json_write_int32(w, spdk_nvmf_ns_get_id(ns)); + spdk_json_write_name(w, "bdev_name"); + spdk_json_write_string(w, spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + /* NOTE: "name" is kept for compatibility only - new code should use bdev_name. 
*/ + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + spdk_json_write_name(w, "nguid"); + json_write_hex_str(w, ns_opts.nguid, sizeof(ns_opts.nguid)); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + spdk_json_write_name(w, "eui64"); + json_write_hex_str(w, ns_opts.eui64, sizeof(ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + char uuid_str[SPDK_UUID_STRING_LEN]; + + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_name(w, "uuid"); + spdk_json_write_string(w, uuid_str); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + } + spdk_json_write_object_end(w); +} + +static void +spdk_rpc_get_nvmf_subsystems(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_nvmf_subsystem *subsystem; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_nvmf_subsystems requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); + while (subsystem) { + dump_nvmf_subsystem(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_nvmf_subsystems", spdk_rpc_get_nvmf_subsystems, SPDK_RPC_RUNTIME) + +struct rpc_subsystem_create { + char *nqn; + char *serial_number; + uint32_t max_namespaces; + bool allow_any_host; +}; + +static const struct spdk_json_object_decoder rpc_subsystem_create_decoders[] = { + {"nqn", offsetof(struct rpc_subsystem_create, nqn), spdk_json_decode_string}, + {"serial_number", offsetof(struct rpc_subsystem_create, serial_number), spdk_json_decode_string, true}, + {"max_namespaces", offsetof(struct rpc_subsystem_create, max_namespaces), spdk_json_decode_uint32, true}, + {"allow_any_host", offsetof(struct rpc_subsystem_create, allow_any_host), spdk_json_decode_bool, true}, +}; + +static void +spdk_rpc_nvmf_subsystem_started(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +spdk_rpc_nvmf_subsystem_create(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_subsystem_create *req; + struct spdk_nvmf_subsystem *subsystem; + + req = calloc(1, sizeof(*req)); + if (!req) { + goto invalid; + } + + if (spdk_json_decode_object(params, rpc_subsystem_create_decoders, + SPDK_COUNTOF(rpc_subsystem_create_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, req->nqn, SPDK_NVMF_SUBTYPE_NVME, + req->max_namespaces); + if (!subsystem) { + goto invalid; + } + + if (req->serial_number) { + if (spdk_nvmf_subsystem_set_sn(subsystem, req->serial_number)) { + SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", req->nqn, req->serial_number); + goto invalid; + } + } + + spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host); + + free(req->nqn); + 
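+	/*
+	 * Success path: the remaining decoded strings are released below, then the
+	 * subsystem is started; the JSON-RPC result is written from
+	 * spdk_rpc_nvmf_subsystem_started(). An illustrative request (example
+	 * values only):
+	 *   {"method": "nvmf_subsystem_create",
+	 *    "params": {"nqn": "nqn.2016-06.io.spdk:cnode1",
+	 *               "serial_number": "SPDK00000000000001",
+	 *               "allow_any_host": false}}
+	 */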
free(req->serial_number); + free(req); + + spdk_nvmf_subsystem_start(subsystem, + spdk_rpc_nvmf_subsystem_started, + request); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + if (req) { + free(req->nqn); + free(req->serial_number); + } + free(req); +} +SPDK_RPC_REGISTER("nvmf_subsystem_create", spdk_rpc_nvmf_subsystem_create, SPDK_RPC_RUNTIME) + +struct rpc_delete_subsystem { + char *nqn; +}; + +static void +free_rpc_delete_subsystem(struct rpc_delete_subsystem *r) +{ + free(r->nqn); +} + +static void +spdk_rpc_nvmf_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + spdk_nvmf_subsystem_destroy(subsystem); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static const struct spdk_json_object_decoder rpc_delete_subsystem_decoders[] = { + {"nqn", offsetof(struct rpc_delete_subsystem, nqn), spdk_json_decode_string}, +}; + +static void +spdk_rpc_delete_nvmf_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_subsystem req = {}; + struct spdk_nvmf_subsystem *subsystem; + + if (spdk_json_decode_object(params, rpc_delete_subsystem_decoders, + SPDK_COUNTOF(rpc_delete_subsystem_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.nqn == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, req.nqn); + if (!subsystem) { + goto invalid; + } + + free_rpc_delete_subsystem(&req); + + spdk_nvmf_subsystem_stop(subsystem, + spdk_rpc_nvmf_subsystem_stopped, + request); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_delete_subsystem(&req); +} +SPDK_RPC_REGISTER("delete_nvmf_subsystem", spdk_rpc_delete_nvmf_subsystem, SPDK_RPC_RUNTIME) + +struct rpc_listen_address { + char *transport; + char *adrfam; + char *traddr; + char *trsvcid; +}; + +#define RPC_MAX_LISTEN_ADDRESSES 255 +#define RPC_MAX_NAMESPACES 255 + +struct rpc_listen_addresses { + size_t num_listen_address; + struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES]; +}; + +static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = { + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"trtype", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"adrfam", offsetof(struct rpc_listen_address, adrfam), spdk_json_decode_string, true}, + {"traddr", offsetof(struct rpc_listen_address, traddr), spdk_json_decode_string}, + {"trsvcid", offsetof(struct rpc_listen_address, trsvcid), spdk_json_decode_string}, +}; + +static int +decode_rpc_listen_address(const struct spdk_json_val *val, void *out) +{ + struct rpc_listen_address *req = (struct rpc_listen_address *)out; + if (spdk_json_decode_object(val, rpc_listen_address_decoders, + SPDK_COUNTOF(rpc_listen_address_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + return -1; + } + return 0; +} + +static void +free_rpc_listen_address(struct rpc_listen_address *r) +{ + free(r->transport); + free(r->adrfam); + free(r->traddr); + 
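+	/*
+	 * The strings freed here are the fields of a "listen_address" object as
+	 * decoded by decode_rpc_listen_address(); an illustrative value (example
+	 * addresses only):
+	 *   {"trtype": "RDMA", "adrfam": "IPv4",
+	 *    "traddr": "192.168.1.10", "trsvcid": "4420"}
+	 */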
free(r->trsvcid); +} + +enum nvmf_rpc_listen_op { + NVMF_RPC_LISTEN_ADD, + NVMF_RPC_LISTEN_REMOVE, +}; + +struct nvmf_rpc_listener_ctx { + char *nqn; + struct spdk_nvmf_subsystem *subsystem; + struct rpc_listen_address address; + + struct spdk_jsonrpc_request *request; + struct spdk_nvme_transport_id trid; + enum nvmf_rpc_listen_op op; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_listener_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_listener_ctx, nqn), spdk_json_decode_string}, + {"listen_address", offsetof(struct nvmf_rpc_listener_ctx, address), decode_rpc_listen_address}, +}; + +static void +nvmf_rpc_listener_ctx_free(struct nvmf_rpc_listener_ctx *ctx) +{ + free(ctx->nqn); + free_rpc_listen_address(&ctx->address); + free(ctx); +} + +static void +nvmf_rpc_listen_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + if (ctx->response_sent) { + /* If an error occurred, the response has already been sent. */ + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + nvmf_rpc_listener_ctx_free(ctx); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_tgt_listen(void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (status) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } else { + if (spdk_nvmf_subsystem_add_listener(ctx->subsystem, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + } + + if (spdk_nvmf_subsystem_resume(ctx->subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. */ + } +} + +static void +nvmf_rpc_listen_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_listener_ctx *ctx = cb_arg; + + if (ctx->op == NVMF_RPC_LISTEN_ADD) { + spdk_nvmf_tgt_listen(g_spdk_nvmf_tgt, &ctx->trid, nvmf_rpc_tgt_listen, ctx); + return; + } else if (ctx->op == NVMF_RPC_LISTEN_REMOVE) { + if (spdk_nvmf_subsystem_remove_listener(subsystem, &ctx->trid)) { + SPDK_ERRLOG("Unable to remove listener.\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + } else { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_listen_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_listener_ctx_free(ctx); + /* Can't really do anything to recover here - subsystem will remain paused. 
*/ + } +} + +static int +rpc_listen_address_to_trid(const struct rpc_listen_address *address, + struct spdk_nvme_transport_id *trid) +{ + size_t len; + + memset(trid, 0, sizeof(*trid)); + + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, address->transport)) { + SPDK_ERRLOG("Invalid transport type: %s\n", address->transport); + return -EINVAL; + } + + if (address->adrfam) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, address->adrfam)) { + SPDK_ERRLOG("Invalid adrfam: %s\n", address->adrfam); + return -EINVAL; + } + } else { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } + + len = strlen(address->traddr); + if (len > sizeof(trid->traddr) - 1) { + SPDK_ERRLOG("Transport address longer than %zu characters: %s\n", + sizeof(trid->traddr) - 1, address->traddr); + return -EINVAL; + } + memcpy(trid->traddr, address->traddr, len + 1); + + len = strlen(address->trsvcid); + if (len > sizeof(trid->trsvcid) - 1) { + SPDK_ERRLOG("Transport service id longer than %zu characters: %s\n", + sizeof(trid->trsvcid) - 1, address->trsvcid); + return -EINVAL; + } + memcpy(trid->trsvcid, address->trsvcid, len + 1); + + return 0; +} + +static void +nvmf_rpc_subsystem_add_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_ADD; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_listener", nvmf_rpc_subsystem_add_listener, SPDK_RPC_RUNTIME); + +static void +nvmf_rpc_subsystem_remove_listener(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_listener_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + ctx->request = request; + + if (spdk_json_decode_object(params, nvmf_rpc_listener_decoder, + SPDK_COUNTOF(nvmf_rpc_listener_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + subsystem = 
spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->subsystem = subsystem; + + if (rpc_listen_address_to_trid(&ctx->address, &ctx->trid)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + + ctx->op = NVMF_RPC_LISTEN_REMOVE; + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_listen_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_listener_ctx_free(ctx); + return; + } + +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_listener", nvmf_rpc_subsystem_remove_listener, + SPDK_RPC_RUNTIME); + +struct spdk_nvmf_ns_params { + char *bdev_name; + uint32_t nsid; + char nguid[16]; + char eui64[8]; + struct spdk_uuid uuid; +}; + +struct rpc_namespaces { + size_t num_ns; + struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES]; +}; + + +static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = { + {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true}, + {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string}, + {"nguid", offsetof(struct spdk_nvmf_ns_params, nguid), decode_ns_nguid, true}, + {"eui64", offsetof(struct spdk_nvmf_ns_params, eui64), decode_ns_eui64, true}, + {"uuid", offsetof(struct spdk_nvmf_ns_params, uuid), decode_ns_uuid, true}, +}; + +static int +decode_rpc_ns_params(const struct spdk_json_val *val, void *out) +{ + struct spdk_nvmf_ns_params *ns_params = out; + + return spdk_json_decode_object(val, rpc_ns_params_decoders, + SPDK_COUNTOF(rpc_ns_params_decoders), + ns_params); +} + +struct nvmf_rpc_ns_ctx { + char *nqn; + struct spdk_nvmf_ns_params ns_params; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_ns_ctx, nqn), spdk_json_decode_string}, + {"namespace", offsetof(struct nvmf_rpc_ns_ctx, ns_params), decode_rpc_ns_params}, +}; + +static void +nvmf_rpc_ns_ctx_free(struct nvmf_rpc_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->ns_params.bdev_name); + free(ctx); +} + +static void +nvmf_rpc_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + uint32_t nsid = ctx->ns_params.nsid; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_uint32(w, nsid); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_ns_ctx *ctx = cb_arg; + struct spdk_nvmf_ns_opts ns_opts; + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(ctx->ns_params.bdev_name); + if (!bdev) { + SPDK_ERRLOG("No bdev with name %s\n", ctx->ns_params.bdev_name); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + + spdk_nvmf_ns_opts_get_defaults(&ns_opts, 
sizeof(ns_opts)); + ns_opts.nsid = ctx->ns_params.nsid; + + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(ctx->ns_params.nguid), "size mismatch"); + memcpy(ns_opts.nguid, ctx->ns_params.nguid, sizeof(ns_opts.nguid)); + + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(ctx->ns_params.eui64), "size mismatch"); + memcpy(ns_opts.eui64, ctx->ns_params.eui64, sizeof(ns_opts.eui64)); + + if (!spdk_mem_all_zero(&ctx->ns_params.uuid, sizeof(ctx->ns_params.uuid))) { + ns_opts.uuid = ctx->ns_params.uuid; + } + + ctx->ns_params.nsid = spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts)); + if (ctx->ns_params.nsid == 0) { + SPDK_ERRLOG("Unable to add namespace\n"); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + goto resume; + } + +resume: + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_ns_resumed, ctx)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } +} + +static void +nvmf_rpc_subsystem_add_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_ns_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_ns", nvmf_rpc_subsystem_add_ns, SPDK_RPC_RUNTIME) + +struct nvmf_rpc_remove_ns_ctx { + char *nqn; + uint32_t nsid; + + struct spdk_jsonrpc_request *request; + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_remove_ns_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_remove_ns_ctx, nqn), spdk_json_decode_string}, + {"nsid", offsetof(struct nvmf_rpc_remove_ns_ctx, nsid), spdk_json_decode_uint32}, +}; + +static void +nvmf_rpc_remove_ns_ctx_free(struct nvmf_rpc_remove_ns_ctx *ctx) +{ + free(ctx->nqn); + free(ctx); +} + +static void +nvmf_rpc_remove_ns_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request = ctx->request; + bool response_sent = ctx->response_sent; + struct spdk_json_write_ctx *w; + + nvmf_rpc_remove_ns_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_remove_ns_remove_done(struct spdk_nvmf_subsystem *subsystem, void 
*cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx; + + ctx = cb_arg; + + if (status != 0) { + SPDK_ERRLOG("Unable to remove namespace ID %u\n", ctx->nsid); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_remove_ns_resumed, ctx)) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } +} + +static void +nvmf_rpc_remove_ns_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_remove_ns_ctx *ctx = cb_arg; + int ret; + + ret = spdk_nvmf_subsystem_remove_ns(subsystem, ctx->nsid, nvmf_rpc_remove_ns_remove_done, ctx); + if (ret < 0) { + SPDK_ERRLOG("Unable to remove namespace ID %u\n", ctx->nsid); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + ctx->response_sent = true; + spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_remove_ns_resumed, ctx); + } +} + +static void +nvmf_rpc_subsystem_remove_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_remove_ns_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_remove_ns_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_remove_ns_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_remove_ns_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_remove_ns_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_ns", nvmf_rpc_subsystem_remove_ns, SPDK_RPC_RUNTIME) + +enum nvmf_rpc_host_op { + NVMF_RPC_HOST_ADD, + NVMF_RPC_HOST_REMOVE, + NVMF_RPC_HOST_ALLOW_ANY, +}; + +struct nvmf_rpc_host_ctx { + struct spdk_jsonrpc_request *request; + + char *nqn; + char *host; + + enum nvmf_rpc_host_op op; + + bool allow_any_host; + + bool response_sent; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"host", offsetof(struct nvmf_rpc_host_ctx, host), spdk_json_decode_string}, +}; + +static void +nvmf_rpc_host_ctx_free(struct nvmf_rpc_host_ctx *ctx) +{ + free(ctx->nqn); + free(ctx->host); + free(ctx); +} + +static void +nvmf_rpc_host_resumed(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + bool response_sent = ctx->response_sent; + + request = ctx->request; + nvmf_rpc_host_ctx_free(ctx); + + if (response_sent) { + return; + } + + w = 
spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_host_paused(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct nvmf_rpc_host_ctx *ctx = cb_arg; + int rc = -1; + + switch (ctx->op) { + case NVMF_RPC_HOST_ADD: + rc = spdk_nvmf_subsystem_add_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_REMOVE: + rc = spdk_nvmf_subsystem_remove_host(subsystem, ctx->host); + break; + case NVMF_RPC_HOST_ALLOW_ANY: + rc = spdk_nvmf_subsystem_set_allow_any_host(subsystem, ctx->allow_any_host); + break; + } + + if (rc != 0) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + ctx->response_sent = true; + } + + if (spdk_nvmf_subsystem_resume(subsystem, nvmf_rpc_host_resumed, ctx)) { + if (!ctx->response_sent) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + } + nvmf_rpc_host_ctx_free(ctx); + return; + } +} + +static void +nvmf_rpc_subsystem_add_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ADD; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_add_host", nvmf_rpc_subsystem_add_host, SPDK_RPC_RUNTIME) + +static void +nvmf_rpc_subsystem_remove_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_REMOVE; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + 
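+	/*
+	 * As in nvmf_subsystem_add_host, the host list is only modified while the
+	 * subsystem is paused: pause here, apply the change in
+	 * nvmf_rpc_host_paused(), then resume and send the JSON-RPC response from
+	 * nvmf_rpc_host_resumed().
+	 */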
+ if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_remove_host", nvmf_rpc_subsystem_remove_host, SPDK_RPC_RUNTIME) + + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_any_host_decoder[] = { + {"nqn", offsetof(struct nvmf_rpc_host_ctx, nqn), spdk_json_decode_string}, + {"allow_any_host", offsetof(struct nvmf_rpc_host_ctx, allow_any_host), spdk_json_decode_bool}, +}; + +static void +nvmf_rpc_subsystem_allow_any_host(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_host_ctx *ctx; + struct spdk_nvmf_subsystem *subsystem; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_any_host_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_any_host_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + ctx->request = request; + ctx->op = NVMF_RPC_HOST_ALLOW_ANY; + ctx->response_sent = false; + + subsystem = spdk_nvmf_tgt_find_subsystem(g_spdk_nvmf_tgt, ctx->nqn); + if (!subsystem) { + SPDK_ERRLOG("Unable to find subsystem with NQN %s\n", ctx->nqn); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_host_ctx_free(ctx); + return; + } + + if (spdk_nvmf_subsystem_pause(subsystem, nvmf_rpc_host_paused, ctx)) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Internal error"); + nvmf_rpc_host_ctx_free(ctx); + return; + } +} +SPDK_RPC_REGISTER("nvmf_subsystem_allow_any_host", nvmf_rpc_subsystem_allow_any_host, + SPDK_RPC_RUNTIME) + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_opts_decoder[] = { + {"max_queue_depth", offsetof(struct spdk_nvmf_tgt_opts, max_queue_depth), spdk_json_decode_uint16, true}, + {"max_qpairs_per_ctrlr", offsetof(struct spdk_nvmf_tgt_opts, max_qpairs_per_ctrlr), spdk_json_decode_uint16, true}, + {"in_capsule_data_size", offsetof(struct spdk_nvmf_tgt_opts, in_capsule_data_size), spdk_json_decode_uint32, true}, + {"max_io_size", offsetof(struct spdk_nvmf_tgt_opts, max_io_size), spdk_json_decode_uint32, true}, + {"max_subsystems", offsetof(struct spdk_nvmf_tgt_opts, max_subsystems), spdk_json_decode_uint32, true}, + {"io_unit_size", offsetof(struct spdk_nvmf_tgt_opts, io_unit_size), spdk_json_decode_uint32, true}, +}; + +static void +nvmf_rpc_subsystem_set_tgt_opts(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_nvmf_tgt_opts *opts; + struct spdk_json_write_ctx *w; + + if (g_spdk_nvmf_tgt_opts != NULL) { + SPDK_ERRLOG("this RPC must not be called more than once.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Must not call more than once"); + return; + } + + opts = calloc(1, sizeof(*opts)); + if (opts == NULL) { + SPDK_ERRLOG("malloc() failed for target options\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + spdk_nvmf_tgt_opts_init(opts); + + if (params != NULL) { + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_opts_decoder, 
+ SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_opts_decoder), opts)) { + free(opts); + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + g_spdk_nvmf_tgt_opts = opts; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_nvmf_target_options", nvmf_rpc_subsystem_set_tgt_opts, SPDK_RPC_STARTUP) + +static int decode_conn_sched(const struct spdk_json_val *val, void *out) +{ + enum spdk_nvmf_connect_sched *sched = out; + + if (spdk_json_strequal(val, "roundrobin") == true) { + *sched = CONNECT_SCHED_ROUND_ROBIN; + } else if (spdk_json_strequal(val, "hostip") == true) { + *sched = CONNECT_SCHED_HOST_IP; + } else { + SPDK_ERRLOG("Invalid connection scheduling parameter\n"); + return -EINVAL; + } + + return 0; +} + +static const struct spdk_json_object_decoder nvmf_rpc_subsystem_tgt_conf_decoder[] = { + {"acceptor_poll_rate", offsetof(struct spdk_nvmf_tgt_conf, acceptor_poll_rate), spdk_json_decode_uint32, true}, + {"conn_sched", offsetof(struct spdk_nvmf_tgt_conf, conn_sched), decode_conn_sched, true}, +}; + +static void +nvmf_rpc_subsystem_set_tgt_conf(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_nvmf_tgt_conf *conf; + struct spdk_json_write_ctx *w; + + if (g_spdk_nvmf_tgt_conf != NULL) { + SPDK_ERRLOG("this RPC must not be called more than once.\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Must not call more than once"); + return; + } + + conf = calloc(1, sizeof(*conf)); + if (conf == NULL) { + SPDK_ERRLOG("calloc() failed for target config\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Out of memory"); + return; + } + + conf->acceptor_poll_rate = ACCEPT_TIMEOUT_US; + conf->conn_sched = DEFAULT_CONN_SCHED; + + if (params != NULL) { + if (spdk_json_decode_object(params, nvmf_rpc_subsystem_tgt_conf_decoder, + SPDK_COUNTOF(nvmf_rpc_subsystem_tgt_conf_decoder), conf)) { + free(conf); + SPDK_ERRLOG("spdk_json_decode_object() failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + g_spdk_nvmf_tgt_conf = conf; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_nvmf_target_config", nvmf_rpc_subsystem_set_tgt_conf, SPDK_RPC_STARTUP) + +struct nvmf_rpc_create_transport_ctx { + char *trtype; + struct spdk_nvmf_transport_opts opts; + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder nvmf_rpc_create_transport_decoder[] = { + { "trtype", offsetof(struct nvmf_rpc_create_transport_ctx, trtype), spdk_json_decode_string}, + { + "max_queue_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_queue_depth), + spdk_json_decode_uint16, true + }, + { + "max_qpairs_per_ctrlr", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_qpairs_per_ctrlr), + spdk_json_decode_uint16, true + }, + { + "in_capsule_data_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.in_capsule_data_size), + spdk_json_decode_uint32, true + }, + { + "max_io_size", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_io_size), + spdk_json_decode_uint32, true + }, + { + "io_unit_size", offsetof(struct 
nvmf_rpc_create_transport_ctx, opts.io_unit_size), + spdk_json_decode_uint32, true + }, + { + "max_aq_depth", offsetof(struct nvmf_rpc_create_transport_ctx, opts.max_aq_depth), + spdk_json_decode_uint32, true + }, +}; + +static void +nvmf_rpc_create_transport_ctx_free(struct nvmf_rpc_create_transport_ctx *ctx) +{ + free(ctx->trtype); + free(ctx); +} + +static void +nvmf_rpc_tgt_add_transport_done(void *cb_arg, int status) +{ + struct nvmf_rpc_create_transport_ctx *ctx = cb_arg; + struct spdk_jsonrpc_request *request; + struct spdk_json_write_ctx *w; + + request = ctx->request; + nvmf_rpc_create_transport_ctx_free(ctx); + + if (status) { + SPDK_ERRLOG("Failed to add transport to tgt.(%d)\n", status); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Failed to add transport to tgt.(%d)\n", + status); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static void +nvmf_rpc_create_transport(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct nvmf_rpc_create_transport_ctx *ctx; + enum spdk_nvme_transport_type trtype; + struct spdk_nvmf_transport *transport; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + return; + } + + /* Decode parameters the first time to get the transport type */ + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvme_transport_id_parse_trtype(&trtype, ctx->trtype)) { + SPDK_ERRLOG("Invalid transport type '%s'\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid transport type '%s'\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* Initialize all the transport options (based on transport type) and decode the + * parameters again to update any options passed in rpc create transport call. 
+ */ + spdk_nvmf_transport_opts_init(trtype, &ctx->opts); + if (spdk_json_decode_object(params, nvmf_rpc_create_transport_decoder, + SPDK_COUNTOF(nvmf_rpc_create_transport_decoder), + ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + if (spdk_nvmf_tgt_get_transport(g_spdk_nvmf_tgt, trtype)) { + SPDK_ERRLOG("Transport type '%s' already exists\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' already exists\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + transport = spdk_nvmf_transport_create(trtype, &ctx->opts); + + if (!transport) { + SPDK_ERRLOG("Transport type '%s' create failed\n", ctx->trtype); + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + "Transport type '%s' create failed\n", ctx->trtype); + nvmf_rpc_create_transport_ctx_free(ctx); + return; + } + + /* add transport to target */ + ctx->request = request; + spdk_nvmf_tgt_add_transport(g_spdk_nvmf_tgt, transport, nvmf_rpc_tgt_add_transport_done, ctx); +} + +SPDK_RPC_REGISTER("nvmf_create_transport", nvmf_rpc_create_transport, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc_deprecated.c b/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc_deprecated.c new file mode 100644 index 00000000..30e5d04c --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/nvmf_rpc_deprecated.c @@ -0,0 +1,620 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "event_nvmf.h" + +#include "spdk/bdev.h" +#include "spdk/log.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include "spdk/string.h" +#include "spdk/util.h" + +static int +hex_nybble_to_num(char c) +{ + if (c >= '0' && c <= '9') { + return c - '0'; + } + + if (c >= 'a' && c <= 'f') { + return c - 'a' + 0xA; + } + + if (c >= 'A' && c <= 'F') { + return c - 'A' + 0xA; + } + + return -1; +} + +static int +hex_byte_to_num(const char *str) +{ + int hi, lo; + + hi = hex_nybble_to_num(str[0]); + if (hi < 0) { + return hi; + } + + lo = hex_nybble_to_num(str[1]); + if (lo < 0) { + return lo; + } + + return hi * 16 + lo; +} + +static int +decode_hex_string_be(const char *str, uint8_t *out, size_t size) +{ + size_t i; + + /* Decode a string in "ABCDEF012345" format to its binary representation */ + for (i = 0; i < size; i++) { + int num = hex_byte_to_num(str); + + if (num < 0) { + /* Invalid hex byte or end of string */ + return -1; + } + + out[i] = (uint8_t)num; + str += 2; + } + + if (i != size || *str != '\0') { + /* Length mismatch */ + return -1; + } + + return 0; +} + +static int +decode_ns_nguid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 16-byte NGUID */ + rc = decode_hex_string_be(str, out, 16); + } + + free(str); + return rc; +} + +static int +decode_ns_eui64(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + /* 8-byte EUI-64 */ + rc = decode_hex_string_be(str, out, 8); + } + + free(str); + return rc; +} + +static int +decode_ns_uuid(const struct spdk_json_val *val, void *out) +{ + char *str = NULL; + int rc; + + rc = spdk_json_decode_string(val, &str); + if (rc == 0) { + rc = spdk_uuid_parse(out, str); + } + + free(str); + return rc; +} + +struct rpc_listen_address { + char *transport; + char *adrfam; + char *traddr; + char *trsvcid; +}; + +#define RPC_MAX_LISTEN_ADDRESSES 255 +#define RPC_MAX_HOSTS 255 +#define RPC_MAX_NAMESPACES 255 + +struct rpc_listen_addresses { + size_t num_listen_address; + struct rpc_listen_address addresses[RPC_MAX_LISTEN_ADDRESSES]; +}; + +static const struct spdk_json_object_decoder rpc_listen_address_decoders[] = { + /* NOTE: "transport" is kept for compatibility; new code should use "trtype" */ + {"transport", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"trtype", offsetof(struct rpc_listen_address, transport), spdk_json_decode_string, true}, + {"adrfam", offsetof(struct rpc_listen_address, adrfam), spdk_json_decode_string, true}, + {"traddr", offsetof(struct rpc_listen_address, traddr), spdk_json_decode_string}, + {"trsvcid", offsetof(struct rpc_listen_address, trsvcid), spdk_json_decode_string}, +}; + +static int +decode_rpc_listen_address(const struct spdk_json_val *val, void *out) +{ + struct rpc_listen_address *req = (struct rpc_listen_address *)out; + if (spdk_json_decode_object(val, rpc_listen_address_decoders, + SPDK_COUNTOF(rpc_listen_address_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + return -1; + } + return 0; +} + +static void +free_rpc_listen_address(struct rpc_listen_address *r) +{ + free(r->transport); + free(r->adrfam); + free(r->traddr); + free(r->trsvcid); +} + +static int +rpc_listen_address_to_trid(const struct rpc_listen_address *address, + struct spdk_nvme_transport_id *trid) +{ + size_t len; + + memset(trid, 0, 
sizeof(*trid)); + + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, address->transport)) { + SPDK_ERRLOG("Invalid transport type: %s\n", address->transport); + return -EINVAL; + } + + if (address->adrfam) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, address->adrfam)) { + SPDK_ERRLOG("Invalid adrfam: %s\n", address->adrfam); + return -EINVAL; + } + } else { + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + } + + len = strlen(address->traddr); + if (len > sizeof(trid->traddr) - 1) { + SPDK_ERRLOG("Transport address longer than %zu characters: %s\n", + sizeof(trid->traddr) - 1, address->traddr); + return -EINVAL; + } + memcpy(trid->traddr, address->traddr, len + 1); + + len = strlen(address->trsvcid); + if (len > sizeof(trid->trsvcid) - 1) { + SPDK_ERRLOG("Transport service id longer than %zu characters: %s\n", + sizeof(trid->trsvcid) - 1, address->trsvcid); + return -EINVAL; + } + memcpy(trid->trsvcid, address->trsvcid, len + 1); + + return 0; +} + +static int +decode_rpc_listen_addresses(const struct spdk_json_val *val, void *out) +{ + struct rpc_listen_addresses *listen_addresses = out; + return spdk_json_decode_array(val, decode_rpc_listen_address, &listen_addresses->addresses, + RPC_MAX_LISTEN_ADDRESSES, + &listen_addresses->num_listen_address, sizeof(struct rpc_listen_address)); +} + +struct rpc_hosts { + size_t num_hosts; + char *hosts[RPC_MAX_HOSTS]; +}; + +static int +decode_rpc_hosts(const struct spdk_json_val *val, void *out) +{ + struct rpc_hosts *rpc_hosts = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, rpc_hosts->hosts, RPC_MAX_HOSTS, + &rpc_hosts->num_hosts, sizeof(char *)); +} + + +struct spdk_nvmf_ns_params { + char *bdev_name; + uint32_t nsid; + char nguid[16]; + char eui64[8]; + struct spdk_uuid uuid; +}; + +struct rpc_namespaces { + size_t num_ns; + struct spdk_nvmf_ns_params ns_params[RPC_MAX_NAMESPACES]; +}; + + +static const struct spdk_json_object_decoder rpc_ns_params_decoders[] = { + {"nsid", offsetof(struct spdk_nvmf_ns_params, nsid), spdk_json_decode_uint32, true}, + {"bdev_name", offsetof(struct spdk_nvmf_ns_params, bdev_name), spdk_json_decode_string}, + {"nguid", offsetof(struct spdk_nvmf_ns_params, nguid), decode_ns_nguid, true}, + {"eui64", offsetof(struct spdk_nvmf_ns_params, eui64), decode_ns_eui64, true}, + {"uuid", offsetof(struct spdk_nvmf_ns_params, uuid), decode_ns_uuid, true}, +}; + +static void +free_rpc_ns_params(struct spdk_nvmf_ns_params *ns_params) +{ + free(ns_params->bdev_name); +} + +static void +free_rpc_namespaces(struct rpc_namespaces *r) +{ + size_t i; + + for (i = 0; i < r->num_ns; i++) { + free_rpc_ns_params(&r->ns_params[i]); + } +} + +static int +decode_rpc_ns_params(const struct spdk_json_val *val, void *out) +{ + struct spdk_nvmf_ns_params *ns_params = out; + + return spdk_json_decode_object(val, rpc_ns_params_decoders, + SPDK_COUNTOF(rpc_ns_params_decoders), + ns_params); +} + +static int +decode_rpc_namespaces(const struct spdk_json_val *val, void *out) +{ + struct rpc_namespaces *namespaces = out; + char *names[RPC_MAX_NAMESPACES] = {0}; /* old format - array of strings (bdev names) */ + size_t i; + int rc; + + /* First try to decode namespaces as an array of objects (new format) */ + if (spdk_json_decode_array(val, decode_rpc_ns_params, namespaces->ns_params, + SPDK_COUNTOF(namespaces->ns_params), + &namespaces->num_ns, sizeof(*namespaces->ns_params)) == 0) { + return 0; + } + + /* If that fails, try to decode namespaces as an array of strings (old format) */ + free_rpc_namespaces(namespaces); + 
memset(namespaces, 0, sizeof(*namespaces)); + rc = spdk_json_decode_array(val, spdk_json_decode_string, names, + SPDK_COUNTOF(names), + &namespaces->num_ns, sizeof(char *)); + if (rc == 0) { + /* Decoded old format - copy to ns_params (new format) */ + for (i = 0; i < namespaces->num_ns; i++) { + namespaces->ns_params[i].bdev_name = names[i]; + } + return 0; + } + + /* Failed to decode - don't leave dangling string pointers around */ + for (i = 0; i < namespaces->num_ns; i++) { + free(names[i]); + } + + return rc; +} + +static void +free_rpc_listen_addresses(struct rpc_listen_addresses *r) +{ + size_t i; + + for (i = 0; i < r->num_listen_address; i++) { + free_rpc_listen_address(&r->addresses[i]); + } +} + +static void +free_rpc_hosts(struct rpc_hosts *r) +{ + size_t i; + + for (i = 0; i < r->num_hosts; i++) { + free(r->hosts[i]); + } +} + +struct rpc_subsystem { + int32_t core; + char *mode; + char *nqn; + struct rpc_listen_addresses listen_addresses; + struct rpc_hosts hosts; + bool allow_any_host; + char *pci_address; + char *serial_number; + struct rpc_namespaces namespaces; + uint32_t num_ns; +}; + +static void +free_rpc_subsystem(struct rpc_subsystem *req) +{ + if (req) { + free(req->mode); + free(req->nqn); + free(req->serial_number); + free_rpc_namespaces(&req->namespaces); + free_rpc_listen_addresses(&req->listen_addresses); + free_rpc_hosts(&req->hosts); + } + free(req); +} + +static void +spdk_rpc_nvmf_subsystem_started(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_jsonrpc_request *request = cb_arg; + struct spdk_json_write_ctx *w; + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} + +static const struct spdk_json_object_decoder rpc_subsystem_decoders[] = { + {"core", offsetof(struct rpc_subsystem, core), spdk_json_decode_int32, true}, + {"mode", offsetof(struct rpc_subsystem, mode), spdk_json_decode_string, true}, + {"nqn", offsetof(struct rpc_subsystem, nqn), spdk_json_decode_string}, + {"listen_addresses", offsetof(struct rpc_subsystem, listen_addresses), decode_rpc_listen_addresses, true}, + {"hosts", offsetof(struct rpc_subsystem, hosts), decode_rpc_hosts, true}, + {"allow_any_host", offsetof(struct rpc_subsystem, allow_any_host), spdk_json_decode_bool, true}, + {"serial_number", offsetof(struct rpc_subsystem, serial_number), spdk_json_decode_string, true}, + {"namespaces", offsetof(struct rpc_subsystem, namespaces), decode_rpc_namespaces, true}, + {"max_namespaces", offsetof(struct rpc_subsystem, num_ns), spdk_json_decode_uint32, true}, +}; + +struct subsystem_listen_ctx { + struct rpc_subsystem *req; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_jsonrpc_request *request; + + uint32_t idx; +}; + +static void +spdk_rpc_construct_subsystem_listen_done(void *cb_arg, int status) +{ + struct subsystem_listen_ctx *ctx = cb_arg; + struct rpc_listen_address *addr; + struct spdk_nvme_transport_id trid = {0}; + + if (status) { + goto invalid; + } + + addr = &ctx->req->listen_addresses.addresses[ctx->idx]; + if (rpc_listen_address_to_trid(addr, &trid)) { + goto invalid; + } + + spdk_nvmf_subsystem_add_listener(ctx->subsystem, &trid); + + ctx->idx++; + + if (ctx->idx < ctx->req->listen_addresses.num_listen_address) { + addr = &ctx->req->listen_addresses.addresses[ctx->idx]; + + if (rpc_listen_address_to_trid(addr, &trid)) { + goto invalid; + } + + spdk_nvmf_tgt_listen(g_spdk_nvmf_tgt, &trid, spdk_rpc_construct_subsystem_listen_done, 
ctx); + return; + } + + spdk_nvmf_subsystem_start(ctx->subsystem, + spdk_rpc_nvmf_subsystem_started, + ctx->request); + + free_rpc_subsystem(ctx->req); + free(ctx); + + return; + +invalid: + spdk_nvmf_subsystem_destroy(ctx->subsystem); + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_subsystem(ctx->req); + free(ctx); +} + +static void +spdk_rpc_construct_nvmf_subsystem(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_subsystem *req; + struct spdk_nvmf_subsystem *subsystem; + size_t i; + + SPDK_WARNLOG("The construct_nvmf_subsystem RPC is deprecated. Use nvmf_subsystem_create instead.\n"); + + req = calloc(1, sizeof(*req)); + if (!req) { + goto invalid; + } + + req->core = -1; /* Explicitly set the core as the uninitialized value */ + + if (spdk_json_decode_object(params, rpc_subsystem_decoders, + SPDK_COUNTOF(rpc_subsystem_decoders), + req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + /* Mode is no longer a valid parameter, but print out a nice + * message if it exists to inform users. + */ + if (req->mode) { + SPDK_NOTICELOG("Mode present in the construct NVMe-oF subsystem RPC.\n" + "Mode was removed as a valid parameter.\n"); + if (strcasecmp(req->mode, "Virtual") == 0) { + SPDK_NOTICELOG("Your mode value is 'Virtual' which is now the only possible mode.\n" + "Your RPC will work as expected.\n"); + } else { + SPDK_NOTICELOG("Please remove 'mode' from the RPC.\n"); + goto invalid; + } + } + + /* Core is no longer a valid parameter, but print out a nice + * message if it exists to inform users. + */ + if (req->core != -1) { + SPDK_NOTICELOG("Core present in the construct NVMe-oF subsystem RPC.\n" + "Core was removed as an option. 
Subsystems can now run on all available cores.\n"); + SPDK_NOTICELOG("Ignoring it and continuing.\n"); + } + + subsystem = spdk_nvmf_subsystem_create(g_spdk_nvmf_tgt, req->nqn, SPDK_NVMF_SUBTYPE_NVME, + req->num_ns); + if (!subsystem) { + goto invalid; + } + + if (spdk_nvmf_subsystem_set_sn(subsystem, req->serial_number)) { + SPDK_ERRLOG("Subsystem %s: invalid serial number '%s'\n", req->nqn, req->serial_number); + goto invalid; + } + + for (i = 0; i < req->hosts.num_hosts; i++) { + spdk_nvmf_subsystem_add_host(subsystem, req->hosts.hosts[i]); + } + + spdk_nvmf_subsystem_set_allow_any_host(subsystem, req->allow_any_host); + + for (i = 0; i < req->namespaces.num_ns; i++) { + struct spdk_nvmf_ns_params *ns_params = &req->namespaces.ns_params[i]; + struct spdk_bdev *bdev; + struct spdk_nvmf_ns_opts ns_opts; + + bdev = spdk_bdev_get_by_name(ns_params->bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("Could not find namespace bdev '%s'\n", ns_params->bdev_name); + spdk_nvmf_subsystem_destroy(subsystem); + goto invalid; + } + + spdk_nvmf_ns_opts_get_defaults(&ns_opts, sizeof(ns_opts)); + ns_opts.nsid = ns_params->nsid; + + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(ns_params->nguid), "size mismatch"); + memcpy(ns_opts.nguid, ns_params->nguid, sizeof(ns_opts.nguid)); + + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(ns_params->eui64), "size mismatch"); + memcpy(ns_opts.eui64, ns_params->eui64, sizeof(ns_opts.eui64)); + + if (!spdk_mem_all_zero(&ns_params->uuid, sizeof(ns_params->uuid))) { + ns_opts.uuid = ns_params->uuid; + } + + if (spdk_nvmf_subsystem_add_ns(subsystem, bdev, &ns_opts, sizeof(ns_opts)) == 0) { + SPDK_ERRLOG("Unable to add namespace\n"); + spdk_nvmf_subsystem_destroy(subsystem); + goto invalid; + } + } + + if (req->listen_addresses.num_listen_address > 0) { + struct rpc_listen_address *addr; + struct spdk_nvme_transport_id trid = {0}; + struct subsystem_listen_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_nvmf_subsystem_destroy(subsystem); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "No Memory"); + free_rpc_subsystem(req); + return; + } + + ctx->req = req; + ctx->subsystem = subsystem; + ctx->request = request; + ctx->idx = 0; + + addr = &req->listen_addresses.addresses[0]; + + if (rpc_listen_address_to_trid(addr, &trid)) { + free(ctx); + goto invalid; + } + + spdk_nvmf_tgt_listen(g_spdk_nvmf_tgt, &trid, spdk_rpc_construct_subsystem_listen_done, ctx); + return; + } + + free_rpc_subsystem(req); + + spdk_nvmf_subsystem_start(subsystem, + spdk_rpc_nvmf_subsystem_started, + request); + + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_subsystem(req); +} +SPDK_RPC_REGISTER("construct_nvmf_subsystem", spdk_rpc_construct_nvmf_subsystem, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/event/subsystems/nvmf/nvmf_tgt.c b/src/spdk/lib/event/subsystems/nvmf/nvmf_tgt.c new file mode 100644 index 00000000..bb35dcce --- /dev/null +++ b/src/spdk/lib/event/subsystems/nvmf/nvmf_tgt.c @@ -0,0 +1,503 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "event_nvmf.h" + +#include "spdk/bdev.h" +#include "spdk/event.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/nvme.h" +#include "spdk/util.h" + +enum nvmf_tgt_state { + NVMF_TGT_INIT_NONE = 0, + NVMF_TGT_INIT_PARSE_CONFIG, + NVMF_TGT_INIT_CREATE_POLL_GROUPS, + NVMF_TGT_INIT_START_SUBSYSTEMS, + NVMF_TGT_INIT_START_ACCEPTOR, + NVMF_TGT_RUNNING, + NVMF_TGT_FINI_STOP_SUBSYSTEMS, + NVMF_TGT_FINI_DESTROY_POLL_GROUPS, + NVMF_TGT_FINI_STOP_ACCEPTOR, + NVMF_TGT_FINI_FREE_RESOURCES, + NVMF_TGT_STOPPED, + NVMF_TGT_ERROR, +}; + +struct nvmf_tgt_poll_group { + struct spdk_nvmf_poll_group *group; +}; + +struct nvmf_tgt_host_trid { + struct spdk_nvme_transport_id host_trid; + uint32_t core; + uint32_t ref; + TAILQ_ENTRY(nvmf_tgt_host_trid) link; +}; + +/* List of host trids that are connected to the target */ +static TAILQ_HEAD(, nvmf_tgt_host_trid) g_nvmf_tgt_host_trids = + TAILQ_HEAD_INITIALIZER(g_nvmf_tgt_host_trids); + +struct spdk_nvmf_tgt *g_spdk_nvmf_tgt = NULL; + +static enum nvmf_tgt_state g_tgt_state; + +/* Round-Robin/IP-based tracking of cores for qpair assignment */ +static uint32_t g_tgt_core; + +static struct nvmf_tgt_poll_group *g_poll_groups = NULL; +static size_t g_num_poll_groups = 0; + +static struct spdk_poller *g_acceptor_poller = NULL; + +static void nvmf_tgt_advance_state(void); + +static void +_spdk_nvmf_shutdown_cb(void *arg1, void *arg2) +{ + /* Still in initialization state, defer shutdown operation */ + if (g_tgt_state < NVMF_TGT_RUNNING) { + spdk_event_call(spdk_event_allocate(spdk_env_get_current_core(), + _spdk_nvmf_shutdown_cb, NULL, NULL)); + return; + } else if (g_tgt_state > NVMF_TGT_RUNNING) { + /* Already in Shutdown status, ignore the signal */ + return; + } + + g_tgt_state = NVMF_TGT_FINI_STOP_SUBSYSTEMS; + nvmf_tgt_advance_state(); +} + +static void +spdk_nvmf_subsystem_fini(void) +{ + /* Always let the first core to handle the case */ + if (spdk_env_get_current_core() != spdk_env_get_first_core()) { + spdk_event_call(spdk_event_allocate(spdk_env_get_first_core(), + _spdk_nvmf_shutdown_cb, NULL, NULL)); + } else { + _spdk_nvmf_shutdown_cb(NULL, NULL); + } +} + +static void +nvmf_tgt_poll_group_add(void *arg1, void *arg2) +{ + struct spdk_nvmf_qpair *qpair = arg1; + struct nvmf_tgt_poll_group *pg = arg2; + + 
spdk_nvmf_poll_group_add(pg->group, qpair); +} + +/* Round robin selection of cores */ +static uint32_t +spdk_nvmf_get_core_rr(void) +{ + uint32_t core; + + core = g_tgt_core; + g_tgt_core = spdk_env_get_next_core(core); + if (g_tgt_core == UINT32_MAX) { + g_tgt_core = spdk_env_get_first_core(); + } + + return core; +} + +static void +nvmf_tgt_remove_host_trid(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvme_transport_id trid_to_remove; + struct nvmf_tgt_host_trid *trid = NULL, *tmp_trid = NULL; + + if (g_spdk_nvmf_tgt_conf->conn_sched != CONNECT_SCHED_HOST_IP) { + return; + } + + if (spdk_nvmf_qpair_get_peer_trid(qpair, &trid_to_remove) != 0) { + return; + } + + TAILQ_FOREACH_SAFE(trid, &g_nvmf_tgt_host_trids, link, tmp_trid) { + if (trid && !strncmp(trid->host_trid.traddr, + trid_to_remove.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1)) { + trid->ref--; + if (trid->ref == 0) { + TAILQ_REMOVE(&g_nvmf_tgt_host_trids, trid, link); + free(trid); + } + + break; + } + } + + return; +} + +static uint32_t +nvmf_tgt_get_qpair_core(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvme_transport_id trid; + struct nvmf_tgt_host_trid *tmp_trid = NULL, *new_trid = NULL; + int ret; + uint32_t core = 0; + + switch (g_spdk_nvmf_tgt_conf->conn_sched) { + case CONNECT_SCHED_HOST_IP: + ret = spdk_nvmf_qpair_get_peer_trid(qpair, &trid); + if (ret) { + SPDK_ERRLOG("Invalid host transport Id. Assigning to core %d\n", core); + break; + } + + TAILQ_FOREACH(tmp_trid, &g_nvmf_tgt_host_trids, link) { + if (tmp_trid && !strncmp(tmp_trid->host_trid.traddr, + trid.traddr, SPDK_NVMF_TRADDR_MAX_LEN + 1)) { + tmp_trid->ref++; + core = tmp_trid->core; + break; + } + } + if (!tmp_trid) { + new_trid = calloc(1, sizeof(*new_trid)); + if (!new_trid) { + SPDK_ERRLOG("Insufficient memory. Assigning to core %d\n", core); + break; + } + /* Get the next available core for the new host */ + core = spdk_nvmf_get_core_rr(); + new_trid->core = core; + memcpy(new_trid->host_trid.traddr, trid.traddr, + SPDK_NVMF_TRADDR_MAX_LEN + 1); + TAILQ_INSERT_TAIL(&g_nvmf_tgt_host_trids, new_trid, link); + } + break; + case CONNECT_SCHED_ROUND_ROBIN: + default: + core = spdk_nvmf_get_core_rr(); + break; + } + + return core; +} + +static void +new_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_event *event; + struct nvmf_tgt_poll_group *pg; + uint32_t core; + uint32_t attempts; + + if (g_tgt_state != NVMF_TGT_RUNNING) { + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + + for (attempts = 0; attempts < g_num_poll_groups; attempts++) { + core = nvmf_tgt_get_qpair_core(qpair); + pg = &g_poll_groups[core]; + if (pg->group != NULL) { + break; + } else { + nvmf_tgt_remove_host_trid(qpair); + } + } + + if (attempts == g_num_poll_groups) { + SPDK_ERRLOG("No poll groups exist.\n"); + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); + return; + } + + event = spdk_event_allocate(core, nvmf_tgt_poll_group_add, qpair, pg); + spdk_event_call(event); +} + +static int +acceptor_poll(void *arg) +{ + struct spdk_nvmf_tgt *tgt = arg; + + spdk_nvmf_tgt_accept(tgt, new_qpair); + + return -1; +} + +static void +nvmf_tgt_destroy_poll_group_done(void *ctx) +{ + g_tgt_state = NVMF_TGT_FINI_STOP_ACCEPTOR; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_destroy_poll_group(void *ctx) +{ + struct nvmf_tgt_poll_group *pg; + + pg = &g_poll_groups[spdk_env_get_current_core()]; + + if (pg->group) { + spdk_nvmf_poll_group_destroy(pg->group); + pg->group = NULL; + } +} + +static void +nvmf_tgt_create_poll_group_done(void *ctx) +{ + g_tgt_state = 
NVMF_TGT_INIT_START_SUBSYSTEMS; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_create_poll_group(void *ctx) +{ + struct nvmf_tgt_poll_group *pg; + + pg = &g_poll_groups[spdk_env_get_current_core()]; + + pg->group = spdk_nvmf_poll_group_create(g_spdk_nvmf_tgt); +} + +static void +nvmf_tgt_subsystem_started(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + + if (subsystem) { + spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + return; + } + + g_tgt_state = NVMF_TGT_INIT_START_ACCEPTOR; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_subsystem_stopped(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + + if (subsystem) { + spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + return; + } + + g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_destroy_done(void *ctx, int status) +{ + struct nvmf_tgt_host_trid *trid, *tmp_trid; + + g_tgt_state = NVMF_TGT_STOPPED; + + TAILQ_FOREACH_SAFE(trid, &g_nvmf_tgt_host_trids, link, tmp_trid) { + TAILQ_REMOVE(&g_nvmf_tgt_host_trids, trid, link); + free(trid); + } + + free(g_spdk_nvmf_tgt_conf); + g_spdk_nvmf_tgt_conf = NULL; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_parse_conf_done(int status) +{ + g_tgt_state = (status == 0) ? NVMF_TGT_INIT_CREATE_POLL_GROUPS : NVMF_TGT_ERROR; + nvmf_tgt_advance_state(); +} + +static void +nvmf_tgt_parse_conf_start(void *ctx) +{ + if (spdk_nvmf_parse_conf(nvmf_tgt_parse_conf_done)) { + SPDK_ERRLOG("spdk_nvmf_parse_conf() failed\n"); + g_tgt_state = NVMF_TGT_ERROR; + nvmf_tgt_advance_state(); + } +} + +static void +nvmf_tgt_advance_state(void) +{ + enum nvmf_tgt_state prev_state; + int rc = -1; + + do { + prev_state = g_tgt_state; + + switch (g_tgt_state) { + case NVMF_TGT_INIT_NONE: { + g_tgt_state = NVMF_TGT_INIT_PARSE_CONFIG; + + /* Find the maximum core number */ + g_num_poll_groups = spdk_env_get_last_core() + 1; + assert(g_num_poll_groups > 0); + + g_poll_groups = calloc(g_num_poll_groups, sizeof(*g_poll_groups)); + if (g_poll_groups == NULL) { + g_tgt_state = NVMF_TGT_ERROR; + rc = -ENOMEM; + break; + } + + g_tgt_core = spdk_env_get_first_core(); + break; + } + case NVMF_TGT_INIT_PARSE_CONFIG: + /* Send message to self to call parse conf func. + * Prevents it from possibly performing cb before getting + * out of this function, which causes problems. 
*/ + spdk_thread_send_msg(spdk_get_thread(), nvmf_tgt_parse_conf_start, NULL); + break; + case NVMF_TGT_INIT_CREATE_POLL_GROUPS: + /* Send a message to each thread and create a poll group */ + spdk_for_each_thread(nvmf_tgt_create_poll_group, + NULL, + nvmf_tgt_create_poll_group_done); + break; + case NVMF_TGT_INIT_START_SUBSYSTEMS: { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); + + if (subsystem) { + spdk_nvmf_subsystem_start(subsystem, nvmf_tgt_subsystem_started, NULL); + } else { + g_tgt_state = NVMF_TGT_INIT_START_ACCEPTOR; + } + break; + } + case NVMF_TGT_INIT_START_ACCEPTOR: + g_acceptor_poller = spdk_poller_register(acceptor_poll, g_spdk_nvmf_tgt, + g_spdk_nvmf_tgt_conf->acceptor_poll_rate); + SPDK_INFOLOG(SPDK_LOG_NVMF, "Acceptor running\n"); + g_tgt_state = NVMF_TGT_RUNNING; + break; + case NVMF_TGT_RUNNING: + spdk_subsystem_init_next(0); + break; + case NVMF_TGT_FINI_STOP_SUBSYSTEMS: { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = spdk_nvmf_subsystem_get_first(g_spdk_nvmf_tgt); + + if (subsystem) { + spdk_nvmf_subsystem_stop(subsystem, nvmf_tgt_subsystem_stopped, NULL); + } else { + g_tgt_state = NVMF_TGT_FINI_DESTROY_POLL_GROUPS; + } + break; + } + case NVMF_TGT_FINI_DESTROY_POLL_GROUPS: + /* Send a message to each thread and destroy the poll group */ + spdk_for_each_thread(nvmf_tgt_destroy_poll_group, + NULL, + nvmf_tgt_destroy_poll_group_done); + break; + case NVMF_TGT_FINI_STOP_ACCEPTOR: + spdk_poller_unregister(&g_acceptor_poller); + g_tgt_state = NVMF_TGT_FINI_FREE_RESOURCES; + break; + case NVMF_TGT_FINI_FREE_RESOURCES: + spdk_nvmf_tgt_destroy(g_spdk_nvmf_tgt, nvmf_tgt_destroy_done, NULL); + break; + case NVMF_TGT_STOPPED: + spdk_subsystem_fini_next(); + return; + case NVMF_TGT_ERROR: + spdk_subsystem_init_next(rc); + return; + } + + } while (g_tgt_state != prev_state); +} + +static void +spdk_nvmf_subsystem_init(void) +{ + g_tgt_state = NVMF_TGT_INIT_NONE; + nvmf_tgt_advance_state(); +} + +static char * +get_conn_sched_string(enum spdk_nvmf_connect_sched sched) +{ + if (sched == CONNECT_SCHED_HOST_IP) { + return "hostip"; + } else { + return "roundrobin"; + } +} + +static void +spdk_nvmf_subsystem_write_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev) +{ + spdk_json_write_array_begin(w); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "set_nvmf_target_config"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "acceptor_poll_rate", g_spdk_nvmf_tgt_conf->acceptor_poll_rate); + spdk_json_write_named_string(w, "conn_sched", + get_conn_sched_string(g_spdk_nvmf_tgt_conf->conn_sched)); + spdk_json_write_object_end(w); + spdk_json_write_object_end(w); + + spdk_nvmf_tgt_write_config_json(w, g_spdk_nvmf_tgt); + spdk_json_write_array_end(w); + + spdk_event_call(done_ev); +} + +static struct spdk_subsystem g_spdk_subsystem_nvmf = { + .name = "nvmf", + .init = spdk_nvmf_subsystem_init, + .fini = spdk_nvmf_subsystem_fini, + .write_config_json = spdk_nvmf_subsystem_write_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_nvmf) +SPDK_SUBSYSTEM_DEPEND(nvmf, bdev) diff --git a/src/spdk/lib/event/subsystems/scsi/Makefile b/src/spdk/lib/event/subsystems/scsi/Makefile new file mode 100644 index 00000000..12bf15e3 --- /dev/null +++ b/src/spdk/lib/event/subsystems/scsi/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = scsi.c +LIBNAME = event_scsi + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/scsi/scsi.c b/src/spdk/lib/event/subsystems/scsi/scsi.c new file mode 100644 index 00000000..a37ebf61 --- /dev/null +++ b/src/spdk/lib/event/subsystems/scsi/scsi.c @@ -0,0 +1,65 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/scsi.h" + +#include "spdk_internal/event.h" + +static void +spdk_scsi_subsystem_init(void) +{ + int rc; + + rc = spdk_scsi_init(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_scsi_subsystem_fini(void) +{ + spdk_scsi_fini(); + spdk_subsystem_fini_next(); +} + +static struct spdk_subsystem g_spdk_subsystem_scsi = { + .name = "scsi", + .init = spdk_scsi_subsystem_init, + .fini = spdk_scsi_subsystem_fini, + .config = NULL, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_scsi); +SPDK_SUBSYSTEM_DEPEND(scsi, bdev) diff --git a/src/spdk/lib/event/subsystems/vhost/Makefile b/src/spdk/lib/event/subsystems/vhost/Makefile new file mode 100644 index 00000000..2e0d61fe --- /dev/null +++ b/src/spdk/lib/event/subsystems/vhost/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = vhost.c +LIBNAME = event_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/event/subsystems/vhost/vhost.c b/src/spdk/lib/event/subsystems/vhost/vhost.c new file mode 100644 index 00000000..1fdbc6aa --- /dev/null +++ b/src/spdk/lib/event/subsystems/vhost/vhost.c @@ -0,0 +1,71 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/vhost.h" + +#include "spdk_internal/event.h" + +static void +spdk_vhost_subsystem_init(void) +{ + int rc = 0; + + rc = spdk_vhost_init(); + + spdk_subsystem_init_next(rc); +} + +static void +spdk_vhost_subsystem_fini_done(void) +{ + spdk_subsystem_fini_next(); +} + +static void +spdk_vhost_subsystem_fini(void) +{ + spdk_vhost_fini(spdk_vhost_subsystem_fini_done); +} + +static struct spdk_subsystem g_spdk_subsystem_vhost = { + .name = "vhost", + .init = spdk_vhost_subsystem_init, + .fini = spdk_vhost_subsystem_fini, + .config = NULL, + .write_config_json = spdk_vhost_config_json, +}; + +SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_vhost); +SPDK_SUBSYSTEM_DEPEND(vhost, scsi) diff --git a/src/spdk/lib/ioat/Makefile b/src/spdk/lib/ioat/Makefile new file mode 100644 index 00000000..d59d607f --- /dev/null +++ b/src/spdk/lib/ioat/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
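The nvmf, scsi and vhost files in this patch all follow the same event-subsystem shape: fill in a struct spdk_subsystem, signal completion through spdk_subsystem_init_next()/spdk_subsystem_fini_next(), and register with SPDK_SUBSYSTEM_REGISTER plus an optional SPDK_SUBSYSTEM_DEPEND. A minimal sketch of that pattern for a hypothetical subsystem follows; "example" and the example_lib_* calls are invented names used only for illustration.

#include "spdk/stdinc.h"

#include "spdk_internal/event.h"

/* Hypothetical subsystem mirroring the scsi/vhost/nvmf registration pattern above. */
static void
spdk_example_subsystem_init(void)
{
	int rc = 0;	/* rc = example_lib_init(); in a real subsystem */

	/* Hand control back to the event framework, passing the init status. */
	spdk_subsystem_init_next(rc);
}

static void
spdk_example_subsystem_fini(void)
{
	/* example_lib_fini(); in a real subsystem */
	spdk_subsystem_fini_next();
}

static struct spdk_subsystem g_spdk_subsystem_example = {
	.name = "example",
	.init = spdk_example_subsystem_init,
	.fini = spdk_example_subsystem_fini,
	.config = NULL,
};

SPDK_SUBSYSTEM_REGISTER(g_spdk_subsystem_example)
/* Ensure bdev comes up first, as the scsi and nvmf subsystems require. */
SPDK_SUBSYSTEM_DEPEND(example, bdev)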
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = ioat.c +LIBNAME = ioat + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ioat/ioat.c b/src/spdk/lib/ioat/ioat.c new file mode 100644 index 00000000..d8c15bf3 --- /dev/null +++ b/src/spdk/lib/ioat/ioat.c @@ -0,0 +1,733 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "ioat_internal.h" + +#include "spdk/env.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct ioat_driver { + pthread_mutex_t lock; + TAILQ_HEAD(, spdk_ioat_chan) attached_chans; +}; + +static struct ioat_driver g_ioat_driver = { + .lock = PTHREAD_MUTEX_INITIALIZER, + .attached_chans = TAILQ_HEAD_INITIALIZER(g_ioat_driver.attached_chans), +}; + +static uint64_t +ioat_get_chansts(struct spdk_ioat_chan *ioat) +{ + return spdk_mmio_read_8(&ioat->regs->chansts); +} + +static void +ioat_write_chancmp(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chancmp, addr); +} + +static void +ioat_write_chainaddr(struct spdk_ioat_chan *ioat, uint64_t addr) +{ + spdk_mmio_write_8(&ioat->regs->chainaddr, addr); +} + +static inline void +ioat_suspend(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_SUSPEND; +} + +static inline void +ioat_reset(struct spdk_ioat_chan *ioat) +{ + ioat->regs->chancmd = SPDK_IOAT_CHANCMD_RESET; +} + +static inline uint32_t +ioat_reset_pending(struct spdk_ioat_chan *ioat) +{ + uint8_t cmd; + + cmd = ioat->regs->chancmd; + return (cmd & SPDK_IOAT_CHANCMD_RESET) == SPDK_IOAT_CHANCMD_RESET; +} + +static int +ioat_map_pci_bar(struct spdk_ioat_chan *ioat) +{ + int regs_bar, rc; + void *addr; + uint64_t phys_addr, size; + + regs_bar = 0; + rc = spdk_pci_device_map_bar(ioat->device, regs_bar, &addr, &phys_addr, &size); + if (rc != 0 || addr == NULL) { + SPDK_ERRLOG("pci_device_map_range failed with error code %d\n", + rc); + return -1; + } + + ioat->regs = (volatile struct spdk_ioat_registers *)addr; + + return 0; +} + +static int +ioat_unmap_pci_bar(struct spdk_ioat_chan *ioat) +{ + int rc = 0; + void *addr = (void *)ioat->regs; + + if (addr) { + rc = spdk_pci_device_unmap_bar(ioat->device, 0, addr); + } + return rc; +} + + +static inline uint32_t +ioat_get_active(struct spdk_ioat_chan *ioat) +{ + return (ioat->head - ioat->tail) & ((1 << ioat->ring_size_order) - 1); +} + +static inline uint32_t +ioat_get_ring_space(struct spdk_ioat_chan *ioat) +{ + return (1 << ioat->ring_size_order) - ioat_get_active(ioat) - 1; +} + +static uint32_t +ioat_get_ring_index(struct spdk_ioat_chan *ioat, uint32_t index) +{ + return index & ((1 << ioat->ring_size_order) - 1); +} + +static void +ioat_get_ring_entry(struct spdk_ioat_chan *ioat, uint32_t index, + struct ioat_descriptor **desc, + union spdk_ioat_hw_desc **hw_desc) +{ + uint32_t i = ioat_get_ring_index(ioat, index); + + *desc = &ioat->ring[i]; + *hw_desc = &ioat->hw_ring[i]; +} + +static void +ioat_submit_single(struct spdk_ioat_chan *ioat) +{ + ioat->head++; +} + +static void +ioat_flush(struct spdk_ioat_chan *ioat) +{ + ioat->regs->dmacount = (uint16_t)ioat->head; +} + +static struct ioat_descriptor * +ioat_prep_null(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + hw_desc->dma.u.control.null = 1; + hw_desc->dma.u.control.completion_update = 1; + + hw_desc->dma.size = 8; + hw_desc->dma.src_addr = 0; + hw_desc->dma.dest_addr = 0; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_copy(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t src, uint32_t len) +{ + struct 
ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->dma.u.control_raw = 0; + hw_desc->dma.u.control.op = SPDK_IOAT_OP_COPY; + hw_desc->dma.u.control.completion_update = 1; + + hw_desc->dma.size = len; + hw_desc->dma.src_addr = src; + hw_desc->dma.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static struct ioat_descriptor * +ioat_prep_fill(struct spdk_ioat_chan *ioat, uint64_t dst, + uint64_t fill_pattern, uint32_t len) +{ + struct ioat_descriptor *desc; + union spdk_ioat_hw_desc *hw_desc; + + assert(len <= ioat->max_xfer_size); + + if (ioat_get_ring_space(ioat) < 1) { + return NULL; + } + + ioat_get_ring_entry(ioat, ioat->head, &desc, &hw_desc); + + hw_desc->fill.u.control_raw = 0; + hw_desc->fill.u.control.op = SPDK_IOAT_OP_FILL; + hw_desc->fill.u.control.completion_update = 1; + + hw_desc->fill.size = len; + hw_desc->fill.src_data = fill_pattern; + hw_desc->fill.dest_addr = dst; + + desc->callback_fn = NULL; + desc->callback_arg = NULL; + + ioat_submit_single(ioat); + + return desc; +} + +static int ioat_reset_hw(struct spdk_ioat_chan *ioat) +{ + int timeout; + uint64_t status; + uint32_t chanerr; + int rc; + + status = ioat_get_chansts(ioat); + if (is_ioat_active(status) || is_ioat_idle(status)) { + ioat_suspend(ioat); + } + + timeout = 20; /* in milliseconds */ + while (is_ioat_active(status) || is_ioat_idle(status)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for suspend\n"); + return -1; + } + status = ioat_get_chansts(ioat); + } + + /* + * Clear any outstanding errors. + * CHANERR is write-1-to-clear, so write the current CHANERR bits back to reset everything. 
+ */ + chanerr = ioat->regs->chanerr; + ioat->regs->chanerr = chanerr; + + if (ioat->regs->cbver < SPDK_IOAT_VER_3_3) { + rc = spdk_pci_device_cfg_read32(ioat->device, &chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + if (rc) { + SPDK_ERRLOG("failed to read the internal channel error register\n"); + return -1; + } + + spdk_pci_device_cfg_write32(ioat->device, chanerr, + SPDK_IOAT_PCI_CHANERR_INT_OFFSET); + } + + ioat_reset(ioat); + + timeout = 20; + while (ioat_reset_pending(ioat)) { + spdk_delay_us(1000); + timeout--; + if (timeout == 0) { + SPDK_ERRLOG("timed out waiting for reset\n"); + return -1; + } + } + + return 0; +} + +static int +ioat_process_channel_events(struct spdk_ioat_chan *ioat) +{ + struct ioat_descriptor *desc; + uint64_t status, completed_descriptor, hw_desc_phys_addr; + uint32_t tail; + + if (ioat->head == ioat->tail) { + return 0; + } + + status = *ioat->comp_update; + completed_descriptor = status & SPDK_IOAT_CHANSTS_COMPLETED_DESCRIPTOR_MASK; + + if (is_ioat_halted(status)) { + SPDK_ERRLOG("Channel halted (%x)\n", ioat->regs->chanerr); + return -1; + } + + if (completed_descriptor == ioat->last_seen) { + return 0; + } + + do { + tail = ioat_get_ring_index(ioat, ioat->tail); + desc = &ioat->ring[tail]; + + if (desc->callback_fn) { + desc->callback_fn(desc->callback_arg); + } + + hw_desc_phys_addr = desc->phys_addr; + ioat->tail++; + } while (hw_desc_phys_addr != completed_descriptor); + + ioat->last_seen = hw_desc_phys_addr; + return 0; +} + +static void +ioat_channel_destruct(struct spdk_ioat_chan *ioat) +{ + ioat_unmap_pci_bar(ioat); + + if (ioat->ring) { + free(ioat->ring); + } + + if (ioat->hw_ring) { + spdk_dma_free(ioat->hw_ring); + } + + if (ioat->comp_update) { + spdk_dma_free((void *)ioat->comp_update); + ioat->comp_update = NULL; + } +} + +static int +ioat_channel_start(struct spdk_ioat_chan *ioat) +{ + uint8_t xfercap, version; + uint64_t status; + int i, num_descriptors; + uint64_t comp_update_bus_addr = 0; + uint64_t phys_addr; + + if (ioat_map_pci_bar(ioat) != 0) { + SPDK_ERRLOG("ioat_map_pci_bar() failed\n"); + return -1; + } + + version = ioat->regs->cbver; + if (version < SPDK_IOAT_VER_3_0) { + SPDK_ERRLOG(" unsupported IOAT version %u.%u\n", + version >> 4, version & 0xF); + return -1; + } + + /* Always support DMA copy */ + ioat->dma_capabilities = SPDK_IOAT_ENGINE_COPY_SUPPORTED; + if (ioat->regs->dmacapability & SPDK_IOAT_DMACAP_BFILL) { + ioat->dma_capabilities |= SPDK_IOAT_ENGINE_FILL_SUPPORTED; + } + xfercap = ioat->regs->xfercap; + + /* Only bits [4:0] are valid. */ + xfercap &= 0x1f; + if (xfercap == 0) { + /* 0 means 4 GB max transfer size. */ + ioat->max_xfer_size = 1ULL << 32; + } else if (xfercap < 12) { + /* XFERCAP must be at least 12 (4 KB) according to the spec. 
*/ + SPDK_ERRLOG("invalid XFERCAP value %u\n", xfercap); + return -1; + } else { + ioat->max_xfer_size = 1U << xfercap; + } + + ioat->comp_update = spdk_dma_zmalloc(sizeof(*ioat->comp_update), SPDK_IOAT_CHANCMP_ALIGN, + &comp_update_bus_addr); + if (ioat->comp_update == NULL) { + return -1; + } + + ioat->ring_size_order = IOAT_DEFAULT_ORDER; + + num_descriptors = 1 << ioat->ring_size_order; + + ioat->ring = calloc(num_descriptors, sizeof(struct ioat_descriptor)); + if (!ioat->ring) { + return -1; + } + + ioat->hw_ring = spdk_dma_zmalloc(num_descriptors * sizeof(union spdk_ioat_hw_desc), 64, + NULL); + if (!ioat->hw_ring) { + return -1; + } + + for (i = 0; i < num_descriptors; i++) { + phys_addr = spdk_vtophys(&ioat->hw_ring[i]); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("Failed to translate descriptor %u to physical address\n", i); + return -1; + } + + ioat->ring[i].phys_addr = phys_addr; + ioat->hw_ring[ioat_get_ring_index(ioat, i - 1)].generic.next = phys_addr; + } + + ioat->head = 0; + ioat->tail = 0; + ioat->last_seen = 0; + + ioat_reset_hw(ioat); + + ioat->regs->chanctrl = SPDK_IOAT_CHANCTRL_ANY_ERR_ABORT_EN; + ioat_write_chancmp(ioat, comp_update_bus_addr); + ioat_write_chainaddr(ioat, ioat->ring[0].phys_addr); + + ioat_prep_null(ioat); + ioat_flush(ioat); + + i = 100; + while (i-- > 0) { + spdk_delay_us(100); + status = ioat_get_chansts(ioat); + if (is_ioat_idle(status)) { + break; + } + } + + if (is_ioat_idle(status)) { + ioat_process_channel_events(ioat); + } else { + SPDK_ERRLOG("could not start channel: status = %p\n error = %#x\n", + (void *)status, ioat->regs->chanerr); + return -1; + } + + return 0; +} + +/* Caller must hold g_ioat_driver.lock */ +static struct spdk_ioat_chan * +ioat_attach(struct spdk_pci_device *device) +{ + struct spdk_ioat_chan *ioat; + uint32_t cmd_reg; + + ioat = calloc(1, sizeof(struct spdk_ioat_chan)); + if (ioat == NULL) { + return NULL; + } + + /* Enable PCI busmaster. */ + spdk_pci_device_cfg_read32(device, &cmd_reg, 4); + cmd_reg |= 0x4; + spdk_pci_device_cfg_write32(device, cmd_reg, 4); + + ioat->device = device; + + if (ioat_channel_start(ioat) != 0) { + ioat_channel_destruct(ioat); + free(ioat); + return NULL; + } + + return ioat; +} + +struct ioat_enum_ctx { + spdk_ioat_probe_cb probe_cb; + spdk_ioat_attach_cb attach_cb; + void *cb_ctx; +}; + +/* This function must only be called while holding g_ioat_driver.lock */ +static int +ioat_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct ioat_enum_ctx *enum_ctx = ctx; + struct spdk_ioat_chan *ioat; + + /* Verify that this device is not already attached */ + TAILQ_FOREACH(ioat, &g_ioat_driver.attached_chans, tailq) { + /* + * NOTE: This assumes that the PCI abstraction layer will use the same device handle + * across enumerations; we could compare by BDF instead if this is not true. + */ + if (pci_dev == ioat->device) { + return 0; + } + } + + if (enum_ctx->probe_cb(enum_ctx->cb_ctx, pci_dev)) { + /* + * Since I/OAT init is relatively quick, just perform the full init during probing. + * If this turns out to be a bottleneck later, this can be changed to work like + * NVMe with a list of devices to initialize in parallel. 
+ */ + ioat = ioat_attach(pci_dev); + if (ioat == NULL) { + SPDK_ERRLOG("ioat_attach() failed\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&g_ioat_driver.attached_chans, ioat, tailq); + + enum_ctx->attach_cb(enum_ctx->cb_ctx, pci_dev, ioat); + } + + return 0; +} + +int +spdk_ioat_probe(void *cb_ctx, spdk_ioat_probe_cb probe_cb, spdk_ioat_attach_cb attach_cb) +{ + int rc; + struct ioat_enum_ctx enum_ctx; + + pthread_mutex_lock(&g_ioat_driver.lock); + + enum_ctx.probe_cb = probe_cb; + enum_ctx.attach_cb = attach_cb; + enum_ctx.cb_ctx = cb_ctx; + + rc = spdk_pci_ioat_enumerate(ioat_enum_cb, &enum_ctx); + + pthread_mutex_unlock(&g_ioat_driver.lock); + + return rc; +} + +void +spdk_ioat_detach(struct spdk_ioat_chan *ioat) +{ + struct ioat_driver *driver = &g_ioat_driver; + + /* ioat should be in the free list (not registered to a thread) + * when calling ioat_detach(). + */ + pthread_mutex_lock(&driver->lock); + TAILQ_REMOVE(&driver->attached_chans, ioat, tailq); + pthread_mutex_unlock(&driver->lock); + + ioat_channel_destruct(ioat); + free(ioat); +} + +#define _2MB_PAGE(ptr) ((ptr) & ~(0x200000 - 1)) +#define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1)) + +int +spdk_ioat_submit_copy(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, const void *src, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc; + uint64_t remaining, op_size; + uint64_t vdst, vsrc; + uint64_t vdst_page, vsrc_page; + uint64_t pdst_page, psrc_page; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + vsrc = (uint64_t)src; + vdst_page = vsrc_page = 0; + pdst_page = psrc_page = SPDK_VTOPHYS_ERROR; + + remaining = nbytes; + while (remaining) { + if (_2MB_PAGE(vsrc) != vsrc_page) { + vsrc_page = _2MB_PAGE(vsrc); + psrc_page = spdk_vtophys((void *)vsrc_page); + } + + if (_2MB_PAGE(vdst) != vdst_page) { + vdst_page = _2MB_PAGE(vdst); + pdst_page = spdk_vtophys((void *)vdst_page); + } + op_size = remaining; + op_size = spdk_min(op_size, (0x200000 - _2MB_OFFSET(vsrc))); + op_size = spdk_min(op_size, (0x200000 - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_copy(ioat, + pdst_page + _2MB_OFFSET(vdst), + psrc_page + _2MB_OFFSET(vsrc), + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vsrc += op_size; + vdst += op_size; + + } + /* Issue null descriptor for null transfer */ + if (nbytes == 0) { + last_desc = ioat_prep_null(ioat); + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. 
+ */ + ioat->head = orig_head; + return -ENOMEM; + } + + ioat_flush(ioat); + return 0; +} + +int +spdk_ioat_submit_fill(struct spdk_ioat_chan *ioat, void *cb_arg, spdk_ioat_req_cb cb_fn, + void *dst, uint64_t fill_pattern, uint64_t nbytes) +{ + struct ioat_descriptor *last_desc = NULL; + uint64_t remaining, op_size; + uint64_t vdst; + uint32_t orig_head; + + if (!ioat) { + return -EINVAL; + } + + if (!(ioat->dma_capabilities & SPDK_IOAT_ENGINE_FILL_SUPPORTED)) { + SPDK_ERRLOG("Channel does not support memory fill\n"); + return -1; + } + + orig_head = ioat->head; + + vdst = (uint64_t)dst; + remaining = nbytes; + + while (remaining) { + op_size = remaining; + op_size = spdk_min(op_size, (0x200000 - _2MB_OFFSET(vdst))); + op_size = spdk_min(op_size, ioat->max_xfer_size); + remaining -= op_size; + + last_desc = ioat_prep_fill(ioat, + spdk_vtophys((void *)vdst), + fill_pattern, + op_size); + + if (remaining == 0 || last_desc == NULL) { + break; + } + + vdst += op_size; + } + + if (last_desc) { + last_desc->callback_fn = cb_fn; + last_desc->callback_arg = cb_arg; + } else { + /* + * Ran out of descriptors in the ring - reset head to leave things as they were + * in case we managed to fill out any descriptors. + */ + ioat->head = orig_head; + return -ENOMEM; + } + + ioat_flush(ioat); + return 0; +} + +uint32_t +spdk_ioat_get_dma_capabilities(struct spdk_ioat_chan *ioat) +{ + if (!ioat) { + return 0; + } + return ioat->dma_capabilities; +} + +int +spdk_ioat_process_events(struct spdk_ioat_chan *ioat) +{ + return ioat_process_channel_events(ioat); +} + +SPDK_LOG_REGISTER_COMPONENT("ioat", SPDK_LOG_IOAT) diff --git a/src/spdk/lib/ioat/ioat_internal.h b/src/spdk/lib/ioat/ioat_internal.h new file mode 100644 index 00000000..19593bb0 --- /dev/null +++ b/src/spdk/lib/ioat/ioat_internal.h @@ -0,0 +1,100 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __IOAT_INTERNAL_H__ +#define __IOAT_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/ioat.h" +#include "spdk/ioat_spec.h" +#include "spdk/queue.h" +#include "spdk/mmio.h" + +/* Allocate 1 << 15 (32K) descriptors per channel by default. */ +#define IOAT_DEFAULT_ORDER 15 + +struct ioat_descriptor { + uint64_t phys_addr; + spdk_ioat_req_cb callback_fn; + void *callback_arg; +}; + +/* One of these per allocated PCI device. */ +struct spdk_ioat_chan { + /* Opaque handle to upper layer */ + struct spdk_pci_device *device; + uint64_t max_xfer_size; + volatile struct spdk_ioat_registers *regs; + + volatile uint64_t *comp_update; + + uint32_t head; + uint32_t tail; + + uint32_t ring_size_order; + uint64_t last_seen; + + struct ioat_descriptor *ring; + union spdk_ioat_hw_desc *hw_ring; + uint32_t dma_capabilities; + + /* tailq entry for attached_chans */ + TAILQ_ENTRY(spdk_ioat_chan) tailq; +}; + +static inline uint32_t +is_ioat_active(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_ACTIVE; +} + +static inline uint32_t +is_ioat_idle(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_IDLE; +} + +static inline uint32_t +is_ioat_halted(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_HALTED; +} + +static inline uint32_t +is_ioat_suspended(uint64_t status) +{ + return (status & SPDK_IOAT_CHANSTS_STATUS) == SPDK_IOAT_CHANSTS_SUSPENDED; +} + +#endif /* __IOAT_INTERNAL_H__ */ diff --git a/src/spdk/lib/iscsi/Makefile b/src/spdk/lib/iscsi/Makefile new file mode 100644 index 00000000..624bbf95 --- /dev/null +++ b/src/spdk/lib/iscsi/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I$(SPDK_ROOT_DIR)/lib +C_SRCS = acceptor.c conn.c \ + init_grp.c iscsi.c md5.c param.c portal_grp.c \ + tgt_node.c iscsi_subsystem.c \ + iscsi_rpc.c task.c +LIBNAME = iscsi +LOCAL_SYS_LIBS = -lcrypto + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/iscsi/acceptor.c b/src/spdk/lib/iscsi/acceptor.c new file mode 100644 index 00000000..9b13de30 --- /dev/null +++ b/src/spdk/lib/iscsi/acceptor.c @@ -0,0 +1,91 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk/string.h" +#include "iscsi/acceptor.h" +#include "iscsi/conn.h" +#include "iscsi/portal_grp.h" + +#define ACCEPT_TIMEOUT_US 1000 /* 1ms */ + +static int +spdk_iscsi_portal_accept(void *arg) +{ + struct spdk_iscsi_portal *portal = arg; + struct spdk_sock *sock; + int rc; + int count = 0; + + if (portal->sock == NULL) { + return -1; + } + + while (1) { + sock = spdk_sock_accept(portal->sock); + if (sock != NULL) { + rc = spdk_iscsi_conn_construct(portal, sock); + if (rc < 0) { + spdk_sock_close(&sock); + SPDK_ERRLOG("spdk_iscsi_connection_construct() failed\n"); + break; + } + count++; + } else { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("accept error(%d): %s\n", errno, spdk_strerror(errno)); + } + break; + } + } + + return count; +} + +void +spdk_iscsi_acceptor_start(struct spdk_iscsi_portal *p) +{ + p->acceptor_poller = spdk_poller_register(spdk_iscsi_portal_accept, p, ACCEPT_TIMEOUT_US); +} + +void +spdk_iscsi_acceptor_stop(struct spdk_iscsi_portal *p) +{ + spdk_poller_unregister(&p->acceptor_poller); +} diff --git a/src/spdk/lib/iscsi/acceptor.h b/src/spdk/lib/iscsi/acceptor.h new file mode 100644 index 00000000..9060ee7d --- /dev/null +++ b/src/spdk/lib/iscsi/acceptor.h @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ACCEPTOR_H_ +#define SPDK_ACCEPTOR_H_ + +struct spdk_iscsi_portal; + +void spdk_iscsi_acceptor_start(struct spdk_iscsi_portal *p); +void spdk_iscsi_acceptor_stop(struct spdk_iscsi_portal *p); + +#endif /* SPDK_ACCEPTOR_H_ */ diff --git a/src/spdk/lib/iscsi/conn.c b/src/spdk/lib/iscsi/conn.c new file mode 100644 index 00000000..d5cd5d1e --- /dev/null +++ b/src/spdk/lib/iscsi/conn.c @@ -0,0 +1,1470 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . 
+ * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/thread.h" +#include "spdk/queue.h" +#include "spdk/trace.h" +#include "spdk/net.h" +#include "spdk/sock.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" + +#define SPDK_ISCSI_CONNECTION_MEMSET(conn) \ + memset(&(conn)->portal, 0, sizeof(*(conn)) - \ + offsetof(struct spdk_iscsi_conn, portal)); + +static int g_connections_per_lcore; +static uint32_t *g_num_connections; + +struct spdk_iscsi_conn *g_conns_array = MAP_FAILED; +static int g_conns_array_fd = -1; +static char g_shm_name[64]; + +static pthread_mutex_t g_conns_mutex = PTHREAD_MUTEX_INITIALIZER; + +static struct spdk_poller *g_shutdown_timer = NULL; + +static uint32_t spdk_iscsi_conn_allocate_reactor(const struct spdk_cpuset *cpumask); + +static void spdk_iscsi_conn_full_feature_migrate(void *arg1, void *arg2); +static void spdk_iscsi_conn_stop(struct spdk_iscsi_conn *conn); +static void spdk_iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, + struct spdk_sock *sock); + +static struct spdk_iscsi_conn * +allocate_conn(void) +{ + struct spdk_iscsi_conn *conn; + int i; + + pthread_mutex_lock(&g_conns_mutex); + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + conn = &g_conns_array[i]; + if (!conn->is_valid) { + SPDK_ISCSI_CONNECTION_MEMSET(conn); + conn->is_valid = 1; + pthread_mutex_unlock(&g_conns_mutex); + return conn; + } + } + pthread_mutex_unlock(&g_conns_mutex); + + return NULL; +} + +static void +free_conn(struct spdk_iscsi_conn *conn) +{ + free(conn->portal_host); + free(conn->portal_port); + conn->is_valid = 0; +} + +static struct spdk_iscsi_conn * +spdk_find_iscsi_connection_by_id(int cid) +{ + if (g_conns_array[cid].is_valid == 1) { + return &g_conns_array[cid]; + } else { + return NULL; + } +} + +int 
spdk_initialize_iscsi_conns(void) +{ + size_t conns_size = sizeof(struct spdk_iscsi_conn) * MAX_ISCSI_CONNECTIONS; + uint32_t i, last_core; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_init\n"); + + snprintf(g_shm_name, sizeof(g_shm_name), "/spdk_iscsi_conns.%d", spdk_app_get_shm_id()); + g_conns_array_fd = shm_open(g_shm_name, O_RDWR | O_CREAT, 0600); + if (g_conns_array_fd < 0) { + SPDK_ERRLOG("could not shm_open %s\n", g_shm_name); + goto err; + } + + if (ftruncate(g_conns_array_fd, conns_size) != 0) { + SPDK_ERRLOG("could not ftruncate\n"); + goto err; + } + g_conns_array = mmap(0, conns_size, PROT_READ | PROT_WRITE, MAP_SHARED, + g_conns_array_fd, 0); + + if (g_conns_array == MAP_FAILED) { + fprintf(stderr, "could not mmap cons array file %s (%d)\n", g_shm_name, errno); + goto err; + } + + memset(g_conns_array, 0, conns_size); + + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + g_conns_array[i].id = i; + } + + last_core = spdk_env_get_last_core(); + g_num_connections = calloc(last_core + 1, sizeof(uint32_t)); + if (!g_num_connections) { + SPDK_ERRLOG("Could not allocate array size=%u for g_num_connections\n", + last_core + 1); + goto err; + } + + return 0; + +err: + if (g_conns_array != MAP_FAILED) { + munmap(g_conns_array, conns_size); + g_conns_array = MAP_FAILED; + } + + if (g_conns_array_fd >= 0) { + close(g_conns_array_fd); + g_conns_array_fd = -1; + shm_unlink(g_shm_name); + } + + return -1; +} + +static void +spdk_iscsi_poll_group_add_conn_sock(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *poll_group; + int rc; + + assert(conn->lcore == spdk_env_get_current_core()); + + poll_group = &g_spdk_iscsi.poll_group[conn->lcore]; + + rc = spdk_sock_group_add_sock(poll_group->sock_group, conn->sock, spdk_iscsi_conn_sock_cb, conn); + if (rc < 0) { + SPDK_ERRLOG("Failed to add sock=%p of conn=%p\n", conn->sock, conn); + } +} + +static void +spdk_iscsi_poll_group_remove_conn_sock(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *poll_group; + int rc; + + assert(conn->lcore == spdk_env_get_current_core()); + + poll_group = &g_spdk_iscsi.poll_group[conn->lcore]; + + rc = spdk_sock_group_remove_sock(poll_group->sock_group, conn->sock); + if (rc < 0) { + SPDK_ERRLOG("Failed to remove sock=%p of conn=%p\n", conn->sock, conn); + } +} + +static void +spdk_iscsi_poll_group_add_conn(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *poll_group; + + assert(conn->lcore == spdk_env_get_current_core()); + + poll_group = &g_spdk_iscsi.poll_group[conn->lcore]; + + conn->is_stopped = false; + STAILQ_INSERT_TAIL(&poll_group->connections, conn, link); + spdk_iscsi_poll_group_add_conn_sock(conn); +} + +static void +spdk_iscsi_poll_group_remove_conn(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_poll_group *poll_group; + + assert(conn->lcore == spdk_env_get_current_core()); + + poll_group = &g_spdk_iscsi.poll_group[conn->lcore]; + + conn->is_stopped = true; + STAILQ_REMOVE(&poll_group->connections, conn, spdk_iscsi_conn, link); +} + +/** + * \brief Create an iSCSI connection from the given parameters and schedule it + * on a reactor. 
+ * + * \code + * + * # identify reactor where the new connections work item will be scheduled + * reactor = spdk_iscsi_conn_allocate_reactor() + * allocate spdk_iscsi_conn object + * initialize spdk_iscsi_conn object + * schedule iSCSI connection work item on reactor + * + * \endcode + */ +int +spdk_iscsi_conn_construct(struct spdk_iscsi_portal *portal, + struct spdk_sock *sock) +{ + struct spdk_iscsi_conn *conn; + int bufsize, i, rc; + + conn = allocate_conn(); + if (conn == NULL) { + SPDK_ERRLOG("Could not allocate connection.\n"); + return -1; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + conn->timeout = g_spdk_iscsi.timeout; + conn->nopininterval = g_spdk_iscsi.nopininterval; + conn->nopininterval *= spdk_get_ticks_hz(); /* seconds to TSC */ + conn->nop_outstanding = false; + conn->data_out_cnt = 0; + conn->data_in_cnt = 0; + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + conn->MaxRecvDataSegmentLength = 8192; // RFC3720(12.12) + + conn->portal = portal; + conn->pg_tag = portal->group->tag; + conn->portal_host = strdup(portal->host); + conn->portal_port = strdup(portal->port); + conn->portal_cpumask = portal->cpumask; + conn->sock = sock; + + conn->state = ISCSI_CONN_STATE_INVALID; + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + conn->ttt = 0; + + conn->partial_text_parameter = NULL; + + for (i = 0; i < MAX_CONNECTION_PARAMS; i++) { + conn->conn_param_state_negotiated[i] = false; + } + + for (i = 0; i < MAX_SESSION_PARAMS; i++) { + conn->sess_param_state_negotiated[i] = false; + } + + for (i = 0; i < DEFAULT_MAXR2T; i++) { + conn->outstanding_r2t_tasks[i] = NULL; + } + + TAILQ_INIT(&conn->write_pdu_list); + TAILQ_INIT(&conn->snack_pdu_list); + TAILQ_INIT(&conn->queued_r2t_tasks); + TAILQ_INIT(&conn->active_r2t_tasks); + TAILQ_INIT(&conn->queued_datain_tasks); + memset(&conn->open_lun_descs, 0, sizeof(conn->open_lun_descs)); + + rc = spdk_sock_getaddr(sock, conn->target_addr, sizeof conn->target_addr, NULL, + conn->initiator_addr, sizeof conn->initiator_addr, NULL); + if (rc < 0) { + SPDK_ERRLOG("spdk_sock_getaddr() failed\n"); + goto error_return; + } + + bufsize = 2 * 1024 * 1024; + rc = spdk_sock_set_recvbuf(conn->sock, bufsize); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvbuf failed\n"); + } + + bufsize = 32 * 1024 * 1024 / g_spdk_iscsi.MaxConnections; + if (bufsize > 2 * 1024 * 1024) { + bufsize = 2 * 1024 * 1024; + } + rc = spdk_sock_set_sendbuf(conn->sock, bufsize); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_sendbuf failed\n"); + } + + /* set low water mark */ + rc = spdk_sock_set_recvlowat(conn->sock, 1); + if (rc != 0) { + SPDK_ERRLOG("spdk_sock_set_recvlowat() failed\n"); + goto error_return; + } + + /* set default params */ + rc = spdk_iscsi_conn_params_init(&conn->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_conn_params_init() failed\n"); +error_return: + spdk_iscsi_param_free(conn->params); + free_conn(conn); + return -1; + } + conn->logout_timer = NULL; + conn->shutdown_timer = NULL; + SPDK_NOTICELOG("Launching connection on acceptor thread\n"); + conn->pending_task_cnt = 0; + conn->pending_activate_event = false; + + conn->lcore = spdk_env_get_current_core(); + __sync_fetch_and_add(&g_num_connections[conn->lcore], 1); + + spdk_iscsi_poll_group_add_conn(conn); + return 0; +} + +void +spdk_iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + if (pdu->task) { + if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + if (pdu->task->scsi.offset > 0) { + conn->data_in_cnt--; + if (pdu->bhs.flags & ISCSI_DATAIN_STATUS) { + /* Free 
the primary task after the last subtask done */ + conn->data_in_cnt--; + spdk_iscsi_task_put(spdk_iscsi_task_get_primary(pdu->task)); + } + } + } else if (pdu->bhs.opcode == ISCSI_OP_SCSI_RSP && + pdu->task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + if (pdu->task->scsi.offset > 0) { + spdk_iscsi_task_put(spdk_iscsi_task_get_primary(pdu->task)); + } + } + spdk_iscsi_task_put(pdu->task); + } + spdk_put_pdu(pdu); +} + +static int spdk_iscsi_conn_free_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *iscsi_task, *tmp_iscsi_task; + + TAILQ_FOREACH_SAFE(pdu, &conn->write_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + spdk_iscsi_conn_free_pdu(conn, pdu); + } + + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + if (pdu->task) { + spdk_iscsi_task_put(pdu->task); + } + spdk_put_pdu(pdu); + } + + TAILQ_FOREACH_SAFE(iscsi_task, &conn->queued_datain_tasks, link, tmp_iscsi_task) { + if (!iscsi_task->is_queued) { + TAILQ_REMOVE(&conn->queued_datain_tasks, iscsi_task, link); + spdk_iscsi_task_put(iscsi_task); + } + } + + if (conn->pending_task_cnt) { + return -1; + } + + return 0; +} + +static void spdk_iscsi_conn_free(struct spdk_iscsi_conn *conn) +{ + if (conn == NULL) { + return; + } + + spdk_iscsi_param_free(conn->params); + + /* + * Each connection pre-allocates its next PDU - make sure these get + * freed here. + */ + spdk_put_pdu(conn->pdu_in_progress); + + free_conn(conn); +} + +static void spdk_iscsi_remove_conn(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_sess *sess; + int idx; + uint32_t i, j; + + idx = -1; + sess = conn->sess; + conn->sess = NULL; + if (sess == NULL) { + spdk_iscsi_conn_free(conn); + return; + } + + for (i = 0; i < sess->connections; i++) { + if (sess->conns[i] == conn) { + idx = i; + break; + } + } + + if (sess->connections < 1) { + SPDK_ERRLOG("zero connection\n"); + sess->connections = 0; + } else { + if (idx < 0) { + SPDK_ERRLOG("remove conn not found\n"); + } else { + for (j = idx; j < sess->connections - 1; j++) { + sess->conns[j] = sess->conns[j + 1]; + } + sess->conns[sess->connections - 1] = NULL; + } + sess->connections--; + } + + SPDK_NOTICELOG("Terminating connections(tsih %d): %d\n", sess->tsih, sess->connections); + + if (sess->connections == 0) { + /* cleanup last connection */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "cleanup last conn free sess\n"); + spdk_free_sess(sess); + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "cleanup free conn\n"); + spdk_iscsi_conn_free(conn); +} + +static void +spdk_iscsi_conn_cleanup_backend(struct spdk_iscsi_conn *conn) +{ + int rc; + struct spdk_iscsi_tgt_node *target; + + if (conn->sess->connections > 1) { + /* connection specific cleanup */ + } else if (!g_spdk_iscsi.AllowDuplicateIsid) { + /* clean up all tasks to all LUNs for session */ + target = conn->sess->target; + if (target != NULL) { + rc = spdk_iscsi_tgt_node_cleanup_luns(conn, target); + if (rc < 0) { + SPDK_ERRLOG("target abort failed\n"); + } + } + } +} + +static void +_spdk_iscsi_conn_free(struct spdk_iscsi_conn *conn) +{ + pthread_mutex_lock(&g_conns_mutex); + spdk_iscsi_remove_conn(conn); + pthread_mutex_unlock(&g_conns_mutex); +} + +static int +_spdk_iscsi_conn_check_shutdown(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + rc = spdk_iscsi_conn_free_tasks(conn); + if (rc < 0) { + return -1; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + spdk_iscsi_conn_stop(conn); + 
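+ /* All outstanding tasks are done and the connection was stopped above, so the
+  * connection object can now be returned to the global connection array.
+  */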
_spdk_iscsi_conn_free(conn); + + return -1; +} + +static void +_spdk_iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + int rc; + + spdk_clear_all_transfer_task(conn, NULL); + spdk_iscsi_poll_group_remove_conn_sock(conn); + spdk_sock_close(&conn->sock); + spdk_poller_unregister(&conn->logout_timer); + spdk_poller_unregister(&conn->flush_poller); + + rc = spdk_iscsi_conn_free_tasks(conn); + if (rc < 0) { + /* The connection cannot be freed yet. Check back later. */ + conn->shutdown_timer = spdk_poller_register(_spdk_iscsi_conn_check_shutdown, conn, 1000); + } else { + spdk_iscsi_conn_stop(conn); + _spdk_iscsi_conn_free(conn); + } +} + +static int +_spdk_iscsi_conn_check_pending_tasks(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + if (conn->dev != NULL && spdk_scsi_dev_has_pending_tasks(conn->dev)) { + return -1; + } + + spdk_poller_unregister(&conn->shutdown_timer); + + _spdk_iscsi_conn_destruct(conn); + + return -1; +} + +void +spdk_iscsi_conn_destruct(struct spdk_iscsi_conn *conn) +{ + conn->state = ISCSI_CONN_STATE_EXITED; + + if (conn->sess != NULL && conn->pending_task_cnt > 0) { + spdk_iscsi_conn_cleanup_backend(conn); + } + + if (conn->dev != NULL && spdk_scsi_dev_has_pending_tasks(conn->dev)) { + conn->shutdown_timer = spdk_poller_register(_spdk_iscsi_conn_check_pending_tasks, conn, 1000); + } else { + _spdk_iscsi_conn_destruct(conn); + } +} + +static int +spdk_iscsi_get_active_conns(void) +{ + struct spdk_iscsi_conn *conn; + int num = 0; + int i; + + pthread_mutex_lock(&g_conns_mutex); + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + conn = spdk_find_iscsi_connection_by_id(i); + if (conn == NULL) { + continue; + } + num++; + } + pthread_mutex_unlock(&g_conns_mutex); + return num; +} + +static void +spdk_iscsi_conns_cleanup(void) +{ + free(g_num_connections); + munmap(g_conns_array, sizeof(struct spdk_iscsi_conn) * + MAX_ISCSI_CONNECTIONS); + shm_unlink(g_shm_name); + if (g_conns_array_fd >= 0) { + close(g_conns_array_fd); + g_conns_array_fd = -1; + } +} + +static void +spdk_iscsi_conn_check_shutdown_cb(void *arg1, void *arg2) +{ + spdk_iscsi_conns_cleanup(); + spdk_shutdown_iscsi_conns_done(); +} + +static int +spdk_iscsi_conn_check_shutdown(void *arg) +{ + struct spdk_event *event; + + if (spdk_iscsi_get_active_conns() == 0) { + spdk_poller_unregister(&g_shutdown_timer); + event = spdk_event_allocate(spdk_env_get_current_core(), + spdk_iscsi_conn_check_shutdown_cb, NULL, NULL); + spdk_event_call(event); + } + + return -1; +} + +static void +spdk_iscsi_conn_close_lun(struct spdk_iscsi_conn *conn, int lun_id) +{ + struct spdk_scsi_desc *desc; + + desc = conn->open_lun_descs[lun_id]; + if (desc != NULL) { + spdk_scsi_lun_free_io_channel(desc); + spdk_scsi_lun_close(desc); + conn->open_lun_descs[lun_id] = NULL; + } +} + +static void +spdk_iscsi_conn_close_luns(struct spdk_iscsi_conn *conn) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + spdk_iscsi_conn_close_lun(conn, i); + } +} + +static void +_iscsi_conn_remove_lun(void *arg1, void *arg2) +{ + struct spdk_iscsi_conn *conn = arg1; + struct spdk_scsi_lun *lun = arg2; + int lun_id = spdk_scsi_lun_get_id(lun); + struct spdk_iscsi_pdu *pdu, *tmp_pdu; + struct spdk_iscsi_task *iscsi_task, *tmp_iscsi_task; + + /* If a connection is already in stating status, just return */ + if (conn->state >= ISCSI_CONN_STATE_EXITING) { + return; + } + + spdk_clear_all_transfer_task(conn, lun); + TAILQ_FOREACH_SAFE(pdu, &conn->write_pdu_list, tailq, tmp_pdu) { + if (pdu->task && (lun == pdu->task->scsi.lun)) { + 
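+ /* Drop any queued response PDUs that reference the LUN being removed;
+  * spdk_iscsi_conn_free_pdu() also releases the task attached to each one.
+  */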
TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + spdk_iscsi_conn_free_pdu(conn, pdu); + } + } + + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, tmp_pdu) { + if (pdu->task && (lun == pdu->task->scsi.lun)) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + spdk_iscsi_task_put(pdu->task); + spdk_put_pdu(pdu); + } + } + + TAILQ_FOREACH_SAFE(iscsi_task, &conn->queued_datain_tasks, link, tmp_iscsi_task) { + if ((!iscsi_task->is_queued) && (lun == iscsi_task->scsi.lun)) { + TAILQ_REMOVE(&conn->queued_datain_tasks, iscsi_task, link); + spdk_iscsi_task_put(iscsi_task); + } + } + + spdk_iscsi_conn_close_lun(conn, lun_id); +} + +static void +spdk_iscsi_conn_remove_lun(struct spdk_scsi_lun *lun, void *remove_ctx) +{ + struct spdk_iscsi_conn *conn = remove_ctx; + struct spdk_event *event; + + event = spdk_event_allocate(conn->lcore, _iscsi_conn_remove_lun, + conn, lun); + spdk_event_call(event); +} + +static void +spdk_iscsi_conn_open_luns(struct spdk_iscsi_conn *conn) +{ + int i, rc; + struct spdk_scsi_lun *lun; + struct spdk_scsi_desc *desc; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + lun = spdk_scsi_dev_get_lun(conn->dev, i); + if (lun == NULL) { + continue; + } + + rc = spdk_scsi_lun_open(lun, spdk_iscsi_conn_remove_lun, conn, &desc); + if (rc != 0) { + goto error; + } + + rc = spdk_scsi_lun_allocate_io_channel(desc); + if (rc != 0) { + spdk_scsi_lun_close(desc); + goto error; + } + + conn->open_lun_descs[i] = desc; + } + + return; + +error: + spdk_iscsi_conn_close_luns(conn); +} + +/** + * This function will stop executing the specified connection. + */ +static void +spdk_iscsi_conn_stop(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_tgt_node *target; + + if (conn->state == ISCSI_CONN_STATE_EXITED && conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_NORMAL && + conn->full_feature) { + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns--; + pthread_mutex_unlock(&target->mutex); + + spdk_iscsi_conn_close_luns(conn); + } + + assert(conn->lcore == spdk_env_get_current_core()); + + __sync_fetch_and_sub(&g_num_connections[conn->lcore], 1); + spdk_iscsi_poll_group_remove_conn(conn); +} + +void spdk_shutdown_iscsi_conns(void) +{ + struct spdk_iscsi_conn *conn; + int i; + + pthread_mutex_lock(&g_conns_mutex); + + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + conn = spdk_find_iscsi_connection_by_id(i); + if (conn == NULL) { + continue; + } + + /* Do not set conn->state if the connection has already started exiting. + * This ensures we do not move a connection from EXITED state back to EXITING. + */ + if (conn->state < ISCSI_CONN_STATE_EXITING) { + conn->state = ISCSI_CONN_STATE_EXITING; + } + } + + pthread_mutex_unlock(&g_conns_mutex); + g_shutdown_timer = spdk_poller_register(spdk_iscsi_conn_check_shutdown, NULL, + 1000); +} + +int +spdk_iscsi_drop_conns(struct spdk_iscsi_conn *conn, const char *conn_match, + int drop_all) +{ + struct spdk_iscsi_conn *xconn; + const char *xconn_match; + int i, num; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_drop_conns\n"); + + num = 0; + pthread_mutex_lock(&g_conns_mutex); + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + xconn = spdk_find_iscsi_connection_by_id(i); + + if (xconn == NULL) { + continue; + } + + if (xconn == conn) { + continue; + } + + if (!drop_all && xconn->initiator_port == NULL) { + continue; + } + + xconn_match = + drop_all ? 
xconn->initiator_name : spdk_scsi_port_get_name(xconn->initiator_port); + + if (!strcasecmp(conn_match, xconn_match) && + conn->target == xconn->target) { + + if (num == 0) { + /* + * Only print this message before we report the + * first dropped connection. + */ + SPDK_ERRLOG("drop old connections %s by %s\n", + conn->target->name, conn_match); + } + + SPDK_ERRLOG("exiting conn by %s (%s)\n", + xconn_match, xconn->initiator_addr); + if (xconn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=%u\n", xconn->sess->tsih); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "TSIH=xx\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CID=%u\n", xconn->cid); + + /* Do not set xconn->state if the connection has already started exiting. + * This ensures we do not move a connection from EXITED state back to EXITING. + */ + if (xconn->state < ISCSI_CONN_STATE_EXITING) { + xconn->state = ISCSI_CONN_STATE_EXITING; + } + num++; + } + } + + pthread_mutex_unlock(&g_conns_mutex); + + if (num != 0) { + SPDK_ERRLOG("exiting %d conns\n", num); + } + + return 0; +} + +/** + * \brief Reads data for the specified iSCSI connection from its TCP socket. + * + * The TCP socket is marked as non-blocking, so this function may not read + * all data requested. + * + * Returns SPDK_ISCSI_CONNECTION_FATAL if the recv() operation indicates a fatal + * error with the TCP connection (including if the TCP connection was closed + * unexpectedly. + * + * Otherwise returns the number of bytes successfully read. + */ +int +spdk_iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int bytes, + void *buf) +{ + int ret; + + if (bytes == 0) { + return 0; + } + + ret = spdk_sock_recv(conn->sock, buf, bytes); + + if (ret > 0) { + spdk_trace_record(TRACE_ISCSI_READ_FROM_SOCKET_DONE, conn->id, ret, 0, 0); + return ret; + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } + + /* For connect reset issue, do not output error log */ + if (errno == ECONNRESET) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } else { + SPDK_ERRLOG("spdk_sock_recv() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + } + } + + /* connection closed */ + return SPDK_ISCSI_CONNECTION_FATAL; +} + +void +spdk_iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = spdk_iscsi_task_from_scsi_task(scsi_task); + + spdk_iscsi_task_mgmt_response(task->conn, task); + spdk_iscsi_task_put(task); +} + +static void +process_completed_read_subtask_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *subtask, *tmp; + + TAILQ_FOREACH_SAFE(subtask, &primary->subtask_list, subtask_link, tmp) { + if (subtask->scsi.offset == primary->bytes_completed) { + TAILQ_REMOVE(&primary->subtask_list, subtask, subtask_link); + primary->bytes_completed += subtask->scsi.length; + spdk_iscsi_task_response(conn, subtask); + spdk_iscsi_task_put(subtask); + } else { + break; + } + } +} + +static void +process_read_task_completion(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_task *primary) +{ + struct spdk_iscsi_task *tmp; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + spdk_scsi_task_copy_status(&tmp->scsi, &task->scsi); + } + } + + if ((task != primary) && + (task->scsi.offset != primary->bytes_completed)) { + TAILQ_FOREACH(tmp, &primary->subtask_list, subtask_link) { + if (task->scsi.offset < tmp->scsi.offset) { + 
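+ /* Keep the subtask list sorted by offset so that completed reads can be
+  * returned to the initiator in order once the gap in front of them is filled.
+  */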
TAILQ_INSERT_BEFORE(tmp, task, subtask_link); + return; + } + } + + TAILQ_INSERT_TAIL(&primary->subtask_list, task, subtask_link); + return; + } + + primary->bytes_completed += task->scsi.length; + spdk_iscsi_task_response(conn, task); + + if ((task != primary) || + (task->scsi.transfer_len == task->scsi.length)) { + spdk_iscsi_task_put(task); + } + process_completed_read_subtask_list(conn, primary); + + spdk_iscsi_conn_handle_queued_datain_tasks(conn); +} + +void +spdk_iscsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *primary; + struct spdk_iscsi_task *task = spdk_iscsi_task_from_scsi_task(scsi_task); + struct spdk_iscsi_conn *conn = task->conn; + struct spdk_iscsi_pdu *pdu = task->pdu; + + spdk_trace_record(TRACE_ISCSI_TASK_DONE, conn->id, 0, (uintptr_t)task, 0); + + task->is_queued = false; + primary = spdk_iscsi_task_get_primary(task); + + if (spdk_iscsi_task_is_read(primary)) { + process_read_task_completion(conn, task, primary); + } else { + primary->bytes_completed += task->scsi.length; + if (task != primary) { + if (task->scsi.status == SPDK_SCSI_STATUS_GOOD) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } else { + spdk_scsi_task_copy_status(&primary->scsi, &task->scsi); + } + } + + if (primary->bytes_completed == primary->scsi.transfer_len) { + spdk_del_transfer_task(conn, primary->tag); + spdk_iscsi_task_response(conn, primary); + /* + * Check if this is the last task completed for an iSCSI write + * that required child subtasks. If task != primary, we know + * for sure that it was part of an iSCSI write with child subtasks. + * The trickier case is when the last task completed was the initial + * task - in this case the task will have a smaller length than + * the overall transfer length. + */ + if (task != primary || task->scsi.length != task->scsi.transfer_len) { + TAILQ_REMOVE(&conn->active_r2t_tasks, primary, link); + spdk_iscsi_task_put(primary); + } + } + spdk_iscsi_task_put(task); + } + if (!task->parent) { + spdk_trace_record(TRACE_ISCSI_PDU_COMPLETED, 0, 0, (uintptr_t)pdu, 0); + } +} + +static int +spdk_iscsi_get_pdu_length(struct spdk_iscsi_pdu *pdu, int header_digest, + int data_digest) +{ + int data_len, enable_digest, total; + + enable_digest = 1; + if (pdu->bhs.opcode == ISCSI_OP_LOGIN_RSP) { + enable_digest = 0; + } + + total = ISCSI_BHS_LEN; + + total += (4 * pdu->bhs.total_ahs_len); + + if (enable_digest && header_digest) { + total += ISCSI_DIGEST_LEN; + } + + data_len = DGET24(pdu->bhs.data_segment_len); + if (data_len > 0) { + total += ISCSI_ALIGN(data_len); + if (enable_digest && data_digest) { + total += ISCSI_DIGEST_LEN; + } + } + + return total; +} + +void +spdk_iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn) +{ + uint64_t tsc; + + /** + * This function will be executed by nop_poller of iSCSI polling group, so + * we need to check the connection state first, then do the nop interval + * expiration check work. 
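+ *
+ * Note that conn->nopininterval was already converted to TSC ticks when the
+ * connection was constructed, while conn->timeout is stored in seconds and is
+ * multiplied by spdk_get_ticks_hz() at the comparison below. For example
+ * (illustrative numbers only): with a 2 GHz tick rate and timeout = 30, a
+ * NOP-Out response is considered overdue 60,000,000,000 ticks after last_nopin.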
+ */ + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Check for nop interval expiration */ + tsc = spdk_get_ticks(); + if (conn->nop_outstanding) { + if ((tsc - conn->last_nopin) > (conn->timeout * spdk_get_ticks_hz())) { + SPDK_ERRLOG("Timed out waiting for NOP-Out response from initiator\n"); + SPDK_ERRLOG(" tsc=0x%lx, last_nopin=0x%lx\n", tsc, conn->last_nopin); + SPDK_ERRLOG(" initiator=%s, target=%s\n", conn->initiator_name, + conn->target_short_name); + conn->state = ISCSI_CONN_STATE_EXITING; + } + } else if (tsc - conn->last_nopin > conn->nopininterval) { + spdk_iscsi_send_nopin(conn); + } +} + +/** + * \brief Makes one attempt to flush response PDUs back to the initiator. + * + * Builds a list of iovecs for response PDUs that must be sent back to the + * initiator and passes it to writev(). + * + * Since the socket is non-blocking, writev() may not be able to flush all + * of the iovecs, and may even partially flush one of the iovecs. In this + * case, the partially flushed PDU will remain on the write_pdu_list with + * an offset pointing to the next byte to be flushed. + * + * Returns 0 if all PDUs were flushed. + * + * Returns 1 if some PDUs could not be flushed due to lack of send buffer + * space. + * + * Returns -1 if an exception error occurred indicating the TCP connection + * should be closed. + */ +static int +spdk_iscsi_conn_flush_pdus_internal(struct spdk_iscsi_conn *conn) +{ + const int array_size = 32; + struct iovec iovec_array[array_size]; + struct iovec *iov = iovec_array; + int iovec_cnt = 0; + int bytes = 0; + int total_length = 0; + uint32_t writev_offset; + struct spdk_iscsi_pdu *pdu; + int pdu_length; + + pdu = TAILQ_FIRST(&conn->write_pdu_list); + + if (pdu == NULL) { + return 0; + } + + /* + * Build up a list of iovecs for the first few PDUs in the + * connection's write_pdu_list. + */ + while (pdu != NULL && ((array_size - iovec_cnt) >= 5)) { + pdu_length = spdk_iscsi_get_pdu_length(pdu, + conn->header_digest, + conn->data_digest); + iovec_cnt += spdk_iscsi_build_iovecs(conn, + &iovec_array[iovec_cnt], + pdu); + total_length += pdu_length; + pdu = TAILQ_NEXT(pdu, tailq); + } + + /* + * Check if the first PDU was partially written out the last time + * this function was called, and if so adjust the iovec array + * accordingly. + */ + writev_offset = TAILQ_FIRST(&conn->write_pdu_list)->writev_offset; + total_length -= writev_offset; + while (writev_offset > 0) { + if (writev_offset >= iov->iov_len) { + writev_offset -= iov->iov_len; + iov++; + iovec_cnt--; + } else { + iov->iov_len -= writev_offset; + iov->iov_base = (char *)iov->iov_base + writev_offset; + writev_offset = 0; + } + } + + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_START, conn->id, total_length, 0, iovec_cnt); + + bytes = spdk_sock_writev(conn->sock, iov, iovec_cnt); + if (bytes == -1) { + if (errno == EWOULDBLOCK || errno == EAGAIN) { + return 1; + } else { + SPDK_ERRLOG("spdk_sock_writev() failed, errno %d: %s\n", + errno, spdk_strerror(errno)); + return -1; + } + } + + spdk_trace_record(TRACE_ISCSI_FLUSH_WRITEBUF_DONE, conn->id, bytes, 0, 0); + + pdu = TAILQ_FIRST(&conn->write_pdu_list); + + /* + * Free any PDUs that were fully written. If a PDU was only + * partially written, update its writev_offset so that next + * time only the unwritten portion will be sent to writev(). 
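+ *
+ * For example (illustrative numbers only): if a 512-byte PDU had 100 bytes
+ * sent on a previous call and writev() just reported 300 bytes written,
+ * pdu_length becomes 412 after subtracting writev_offset, so the PDU stays on
+ * the list with writev_offset advanced to 400 and the remaining 112 bytes are
+ * sent on a later call.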
+ */ + while (bytes > 0) { + pdu_length = spdk_iscsi_get_pdu_length(pdu, + conn->header_digest, + conn->data_digest); + pdu_length -= pdu->writev_offset; + + if (bytes >= pdu_length) { + bytes -= pdu_length; + TAILQ_REMOVE(&conn->write_pdu_list, pdu, tailq); + + if ((conn->full_feature) && + (conn->sess->ErrorRecoveryLevel >= 1) && + spdk_iscsi_is_deferred_free_pdu(pdu)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "stat_sn=%d\n", + from_be32(&pdu->bhs.stat_sn)); + TAILQ_INSERT_TAIL(&conn->snack_pdu_list, pdu, + tailq); + } else { + spdk_iscsi_conn_free_pdu(conn, pdu); + } + + pdu = TAILQ_FIRST(&conn->write_pdu_list); + } else { + pdu->writev_offset += bytes; + bytes = 0; + } + } + + return TAILQ_EMPTY(&conn->write_pdu_list) ? 0 : 1; +} + +/** + * \brief Flushes response PDUs back to the initiator. + * + * This function may return without all PDUs having flushed to the + * underlying TCP socket buffer - for example, in the case where the + * socket buffer is already full. + * + * During normal RUNNING connection state, if not all PDUs are flushed, + * then subsequent calls to this routine will eventually flush + * remaining PDUs. + * + * During other connection states (EXITING or LOGGED_OUT), this + * function will spin until all PDUs have successfully been flushed. + * + * Returns 0 for success and when all PDUs were able to be flushed. + * + * Returns 1 for success but when some PDUs could not be flushed due + * to lack of TCP buffer space. + * + * Returns -1 for an exceptional error indicating the TCP connection + * should be closed. + */ +static int +spdk_iscsi_conn_flush_pdus(void *_conn) +{ + struct spdk_iscsi_conn *conn = _conn; + int rc; + + if (conn->state == ISCSI_CONN_STATE_RUNNING) { + rc = spdk_iscsi_conn_flush_pdus_internal(conn); + if (rc == 0 && conn->flush_poller != NULL) { + spdk_poller_unregister(&conn->flush_poller); + } else if (rc == 1 && conn->flush_poller == NULL) { + conn->flush_poller = spdk_poller_register(spdk_iscsi_conn_flush_pdus, conn, 50); + } + } else { + /* + * If the connection state is not RUNNING, then + * keep trying to flush PDUs until our list is + * empty - to make sure all data is sent before + * closing the connection. + */ + do { + rc = spdk_iscsi_conn_flush_pdus_internal(conn); + } while (rc == 1); + } + + if (rc < 0 && conn->state < ISCSI_CONN_STATE_EXITING) { + /* + * If the poller has already started destruction of the connection, + * i.e. the socket read failed, then the connection state may already + * be EXITED. We don't want to set it back to EXITING in that case. + */ + conn->state = ISCSI_CONN_STATE_EXITING; + } + + return -1; +} + +void +spdk_iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + TAILQ_INSERT_TAIL(&conn->write_pdu_list, pdu, tailq); + spdk_iscsi_conn_flush_pdus(conn); +} + +#define GET_PDU_LOOP_COUNT 16 + +static int +spdk_iscsi_conn_handle_incoming_pdus(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *pdu; + int i, rc; + + /* Read new PDUs from network */ + for (i = 0; i < GET_PDU_LOOP_COUNT; i++) { + rc = spdk_iscsi_read_pdu(conn, &pdu); + if (rc == 0) { + break; + } else if (rc == SPDK_ISCSI_CONNECTION_FATAL) { + return rc; + } + + if (conn->state == ISCSI_CONN_STATE_LOGGED_OUT) { + SPDK_ERRLOG("pdu received after logout\n"); + spdk_put_pdu(pdu); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + rc = spdk_iscsi_execute(conn, pdu); + spdk_put_pdu(pdu); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_execute() fatal error on %s(%s)\n", + conn->target_port != NULL ? 
spdk_scsi_port_get_name(conn->target_port) : "NULL", + conn->initiator_port != NULL ? spdk_scsi_port_get_name(conn->initiator_port) : "NULL"); + return rc; + } + + spdk_trace_record(TRACE_ISCSI_TASK_EXECUTED, 0, 0, (uintptr_t)pdu, 0); + if (conn->is_stopped) { + break; + } + } + + return i; +} + +static void +spdk_iscsi_conn_sock_cb(void *arg, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_iscsi_conn *conn = arg; + int rc; + + assert(conn != NULL); + + if ((conn->state == ISCSI_CONN_STATE_EXITED) || + (conn->state == ISCSI_CONN_STATE_EXITING)) { + return; + } + + /* Handle incoming PDUs */ + rc = spdk_iscsi_conn_handle_incoming_pdus(conn); + if (rc < 0) { + conn->state = ISCSI_CONN_STATE_EXITING; + spdk_iscsi_conn_flush_pdus(conn); + } +} + +static void +spdk_iscsi_conn_full_feature_migrate(void *arg1, void *arg2) +{ + struct spdk_iscsi_conn *conn = arg1; + + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + spdk_iscsi_conn_open_luns(conn); + } + + /* The poller has been unregistered, so now we can re-register it on the new core. */ + conn->lcore = spdk_env_get_current_core(); + spdk_iscsi_poll_group_add_conn(conn); +} + +void +spdk_iscsi_conn_migration(struct spdk_iscsi_conn *conn) +{ + int lcore; + struct spdk_event *event; + struct spdk_iscsi_tgt_node *target; + + lcore = spdk_iscsi_conn_allocate_reactor(conn->portal->cpumask); + if (conn->sess->session_type == SESSION_TYPE_NORMAL) { + target = conn->sess->target; + pthread_mutex_lock(&target->mutex); + target->num_active_conns++; + if (target->num_active_conns == 1) { + /** + * This is the only active connection for this target node. + * Save the lcore in the target node so it can be used for + * any other connections to this target node. + */ + target->lcore = lcore; + } else { + /** + * There are other active connections for this target node. + * Ignore the lcore specified by the allocator and use the + * the target node's lcore to ensure this connection runs on + * the same lcore as other connections for this target node. + */ + lcore = target->lcore; + } + pthread_mutex_unlock(&target->mutex); + } + + spdk_iscsi_poll_group_remove_conn_sock(conn); + spdk_poller_unregister(&conn->flush_poller); + spdk_iscsi_conn_stop(conn); + + __sync_fetch_and_add(&g_num_connections[lcore], 1); + conn->last_nopin = spdk_get_ticks(); + event = spdk_event_allocate(lcore, spdk_iscsi_conn_full_feature_migrate, + conn, NULL); + spdk_event_call(event); +} + +void +spdk_iscsi_conn_set_min_per_core(int count) +{ + g_connections_per_lcore = count; +} + +int +spdk_iscsi_conn_get_min_per_core(void) +{ + return g_connections_per_lcore; +} + +static uint32_t +spdk_iscsi_conn_allocate_reactor(const struct spdk_cpuset *cpumask) +{ + uint32_t i, selected_core; + int32_t num_pollers, min_pollers; + + min_pollers = INT_MAX; + selected_core = spdk_env_get_first_core(); + + SPDK_ENV_FOREACH_CORE(i) { + if (!spdk_cpuset_get_cpu(cpumask, i)) { + continue; + } + + /* This core is running. Check how many pollers it already has. */ + num_pollers = g_num_connections[i]; + + if ((num_pollers > 0) && (num_pollers < g_connections_per_lcore)) { + /* Fewer than the maximum connections per core, + * but at least 1. Use this core. 
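+ *
+ * For example (illustrative counts only): with g_connections_per_lcore = 4
+ * and candidate cores currently holding 0, 3 and 5 connections, the core
+ * holding 3 is returned as soon as it is reached; if every candidate held 0
+ * or >= 4 connections, the least-loaded candidate (tracked via min_pollers
+ * and selected_core) would be chosen instead.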
+ */ + return i; + } else if (num_pollers < min_pollers) { + /* Track the core that has the minimum number of pollers + * to be used if no cores meet our criteria + */ + selected_core = i; + min_pollers = num_pollers; + } + } + + return selected_core; +} + +static int +logout_timeout(void *arg) +{ + struct spdk_iscsi_conn *conn = arg; + + spdk_iscsi_conn_destruct(conn); + + return -1; +} + +void +spdk_iscsi_conn_logout(struct spdk_iscsi_conn *conn) +{ + conn->state = ISCSI_CONN_STATE_LOGGED_OUT; + conn->logout_timer = spdk_poller_register(logout_timeout, conn, ISCSI_LOGOUT_TIMEOUT * 1000000); +} + +SPDK_TRACE_REGISTER_FN(iscsi_conn_trace) +{ + spdk_trace_register_owner(OWNER_ISCSI_CONN, 'c'); + spdk_trace_register_object(OBJECT_ISCSI_PDU, 'p'); + spdk_trace_register_description("ISCSI_READ_FROM_SOCKET_DONE", "", + TRACE_ISCSI_READ_FROM_SOCKET_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_FLUSH_WRITEBUF_START", "", TRACE_ISCSI_FLUSH_WRITEBUF_START, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, "iovec: "); + spdk_trace_register_description("ISCSI_FLUSH_WRITEBUF_DONE", "", TRACE_ISCSI_FLUSH_WRITEBUF_DONE, + OWNER_ISCSI_CONN, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("ISCSI_READ_PDU", "", TRACE_ISCSI_READ_PDU, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 1, 0, "opc: "); + spdk_trace_register_description("ISCSI_TASK_DONE", "", TRACE_ISCSI_TASK_DONE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("ISCSI_TASK_QUEUE", "", TRACE_ISCSI_TASK_QUEUE, + OWNER_ISCSI_CONN, OBJECT_SCSI_TASK, 1, 1, "pdu: "); + spdk_trace_register_description("ISCSI_TASK_EXECUTED", "", TRACE_ISCSI_TASK_EXECUTED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); + spdk_trace_register_description("ISCSI_PDU_COMPLETED", "", TRACE_ISCSI_PDU_COMPLETED, + OWNER_ISCSI_CONN, OBJECT_ISCSI_PDU, 0, 0, ""); +} diff --git a/src/spdk/lib/iscsi/conn.h b/src/spdk/lib/iscsi/conn.h new file mode 100644 index 00000000..4a91e698 --- /dev/null +++ b/src/spdk/lib/iscsi/conn.h @@ -0,0 +1,193 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_CONN_H +#define SPDK_ISCSI_CONN_H + +#include "spdk/stdinc.h" + +#include "iscsi/iscsi.h" +#include "spdk/queue.h" +#include "spdk/cpuset.h" +#include "spdk/scsi.h" + +/* + * MAX_CONNECTION_PARAMS: The numbers of the params in conn_param_table + * MAX_SESSION_PARAMS: The numbers of the params in sess_param_table + */ +#define MAX_CONNECTION_PARAMS 14 +#define MAX_SESSION_PARAMS 19 + +#define MAX_ADDRBUF 64 +#define MAX_INITIATOR_ADDR (MAX_ADDRBUF) +#define MAX_TARGET_ADDR (MAX_ADDRBUF) + +#define OWNER_ISCSI_CONN 0x1 + +#define OBJECT_ISCSI_PDU 0x1 + +#define TRACE_GROUP_ISCSI 0x1 +#define TRACE_ISCSI_READ_FROM_SOCKET_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x0) +#define TRACE_ISCSI_FLUSH_WRITEBUF_START SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x1) +#define TRACE_ISCSI_FLUSH_WRITEBUF_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x2) +#define TRACE_ISCSI_READ_PDU SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x3) +#define TRACE_ISCSI_TASK_DONE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x4) +#define TRACE_ISCSI_TASK_QUEUE SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x5) +#define TRACE_ISCSI_TASK_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x6) +#define TRACE_ISCSI_PDU_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_ISCSI, 0x7) + +struct spdk_poller; + +struct spdk_iscsi_conn { + int id; + int is_valid; + /* + * All fields below this point are reinitialized each time the + * connection object is allocated. Make sure to update the + * SPDK_ISCSI_CONNECTION_MEMSET() macro if changing which fields + * are initialized when allocated. + */ + struct spdk_iscsi_portal *portal; + int pg_tag; + char *portal_host; + char *portal_port; + struct spdk_cpuset *portal_cpumask; + uint32_t lcore; + struct spdk_sock *sock; + struct spdk_iscsi_sess *sess; + + enum iscsi_connection_state state; + int login_phase; + + uint64_t last_flush; + uint64_t last_fill; + uint64_t last_nopin; + + /* Timer used to destroy connection after logout if initiator does + * not close the connection. 
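+ * The timer is armed by spdk_iscsi_conn_logout() in conn.c with a period of
+ * ISCSI_LOGOUT_TIMEOUT * 1000000 microseconds, after which logout_timeout()
+ * destructs the connection.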
+ */ + struct spdk_poller *logout_timer; + + /* Timer used to wait for connection to close + */ + struct spdk_poller *shutdown_timer; + + struct spdk_iscsi_pdu *pdu_in_progress; + + TAILQ_HEAD(, spdk_iscsi_pdu) write_pdu_list; + TAILQ_HEAD(, spdk_iscsi_pdu) snack_pdu_list; + + int pending_r2t; + struct spdk_iscsi_task *outstanding_r2t_tasks[DEFAULT_MAXR2T]; + + uint16_t cid; + + /* IP address */ + char initiator_addr[MAX_INITIATOR_ADDR]; + char target_addr[MAX_TARGET_ADDR]; + + /* Initiator/Target port binds */ + char initiator_name[MAX_INITIATOR_NAME]; + struct spdk_scsi_port *initiator_port; + char target_short_name[MAX_TARGET_NAME]; + struct spdk_scsi_port *target_port; + struct spdk_iscsi_tgt_node *target; + struct spdk_scsi_dev *dev; + + /* for fast access */ + int header_digest; + int data_digest; + int full_feature; + + struct iscsi_param *params; + bool sess_param_state_negotiated[MAX_SESSION_PARAMS]; + bool conn_param_state_negotiated[MAX_CONNECTION_PARAMS]; + struct iscsi_chap_auth auth; + int authenticated; + int req_auth; + int req_mutual; + uint32_t pending_task_cnt; + uint32_t data_out_cnt; + uint32_t data_in_cnt; + bool pending_activate_event; + + int timeout; + uint64_t nopininterval; + bool nop_outstanding; + + /* + * This is the maximum data segment length that iscsi target can send + * to the initiator on this connection. Not to be confused with the + * maximum data segment length that initiators can send to iscsi target, which + * is statically defined as SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH. + */ + int MaxRecvDataSegmentLength; + + uint32_t StatSN; + uint32_t exp_statsn; + uint32_t ttt; /* target transfer tag */ + char *partial_text_parameter; + + STAILQ_ENTRY(spdk_iscsi_conn) link; + struct spdk_poller *flush_poller; + bool is_stopped; /* Set true when connection is stopped for migration */ + TAILQ_HEAD(queued_r2t_tasks, spdk_iscsi_task) queued_r2t_tasks; + TAILQ_HEAD(active_r2t_tasks, spdk_iscsi_task) active_r2t_tasks; + TAILQ_HEAD(queued_datain_tasks, spdk_iscsi_task) queued_datain_tasks; + + struct spdk_scsi_desc *open_lun_descs[SPDK_SCSI_DEV_MAX_LUN]; +}; + +extern struct spdk_iscsi_conn *g_conns_array; + +int spdk_initialize_iscsi_conns(void); +void spdk_shutdown_iscsi_conns(void); + +int spdk_iscsi_conn_construct(struct spdk_iscsi_portal *portal, struct spdk_sock *sock); +void spdk_iscsi_conn_destruct(struct spdk_iscsi_conn *conn); +void spdk_iscsi_conn_handle_nop(struct spdk_iscsi_conn *conn); +void spdk_iscsi_conn_migration(struct spdk_iscsi_conn *conn); +void spdk_iscsi_conn_logout(struct spdk_iscsi_conn *conn); +int spdk_iscsi_drop_conns(struct spdk_iscsi_conn *conn, + const char *conn_match, int drop_all); +void spdk_iscsi_conn_set_min_per_core(int count); +int spdk_iscsi_conn_get_min_per_core(void); + +int spdk_iscsi_conn_read_data(struct spdk_iscsi_conn *conn, int len, + void *buf); +void spdk_iscsi_conn_write_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu); + +void spdk_iscsi_conn_free_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu); + +#endif /* SPDK_ISCSI_CONN_H */ diff --git a/src/spdk/lib/iscsi/init_grp.c b/src/spdk/lib/iscsi/init_grp.c new file mode 100644 index 00000000..33b7bfc3 --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.c @@ -0,0 +1,786 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" + +static struct spdk_iscsi_init_grp * +spdk_iscsi_init_grp_create(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + ig = calloc(1, sizeof(*ig)); + if (ig == NULL) { + SPDK_ERRLOG("calloc() failed for initiator group\n"); + return NULL; + } + + ig->tag = tag; + TAILQ_INIT(&ig->initiator_head); + TAILQ_INIT(&ig->netmask_head); + return ig; +} + +static struct spdk_iscsi_initiator_name * +spdk_iscsi_init_grp_find_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + if (!strcmp(iname->name, name)) { + return iname; + } + } + return NULL; +} + +static int +spdk_iscsi_init_grp_add_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + struct spdk_iscsi_initiator_name *iname; + char *p; + + if (ig->ninitiators >= MAX_INITIATOR) { + SPDK_ERRLOG("> MAX_INITIATOR(=%d) is not allowed\n", MAX_INITIATOR); + return -EPERM; + } + + iname = spdk_iscsi_init_grp_find_initiator(ig, name); + if (iname != NULL) { + return -EEXIST; + } + + iname = malloc(sizeof(*iname)); + if (iname == NULL) { + SPDK_ERRLOG("malloc() failed for initiator name str\n"); + return -ENOMEM; + } + + iname->name = strdup(name); + if (iname->name == NULL) { + SPDK_ERRLOG("strdup() failed for initiator name\n"); + free(iname); + return -ENOMEM; + } + + /* Replace "ALL" by "ANY" if set */ + p = strstr(iname->name, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->initiator_head, iname, tailq); + ig->ninitiators++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", name); + return 0; +} + +static int +spdk_iscsi_init_grp_delete_initiator(struct spdk_iscsi_init_grp *ig, char *name) +{ + 
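+ /*
+  * Look up the entry by name; if found, unlink it from the group's list
+  * and free both the strdup()'d name string and the entry itself.
+  * Returns -ENOENT when no initiator with this name exists.
+  */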
struct spdk_iscsi_initiator_name *iname; + + iname = spdk_iscsi_init_grp_find_initiator(ig, name); + if (iname == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname->name); + free(iname); + return 0; +} + +static int +spdk_iscsi_init_grp_add_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = spdk_iscsi_init_grp_add_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + spdk_iscsi_init_grp_delete_initiator(ig, inames[i - 1]); + } + return rc; +} + +static void +spdk_iscsi_init_grp_delete_all_initiators(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_name *iname, *tmp; + + TAILQ_FOREACH_SAFE(iname, &ig->initiator_head, tailq, tmp) { + TAILQ_REMOVE(&ig->initiator_head, iname, tailq); + ig->ninitiators--; + free(iname->name); + free(iname); + } +} + +static int +spdk_iscsi_init_grp_delete_initiators(struct spdk_iscsi_init_grp *ig, int num_inames, char **inames) +{ + int i; + int rc; + + for (i = 0; i < num_inames; i++) { + rc = spdk_iscsi_init_grp_delete_initiator(ig, inames[i]); + if (rc < 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = spdk_iscsi_init_grp_add_initiator(ig, inames[i - 1]); + if (rc != 0) { + spdk_iscsi_init_grp_delete_all_initiators(ig); + break; + } + } + return -1; +} + +static struct spdk_iscsi_initiator_netmask * +spdk_iscsi_init_grp_find_netmask(struct spdk_iscsi_init_grp *ig, const char *mask) +{ + struct spdk_iscsi_initiator_netmask *netmask; + + TAILQ_FOREACH(netmask, &ig->netmask_head, tailq) { + if (!strcmp(netmask->mask, mask)) { + return netmask; + } + } + return NULL; +} + +static int +spdk_iscsi_init_grp_add_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + char *p; + + if (ig->nnetmasks >= MAX_NETMASK) { + SPDK_ERRLOG("> MAX_NETMASK(=%d) is not allowed\n", MAX_NETMASK); + return -EPERM; + } + + imask = spdk_iscsi_init_grp_find_netmask(ig, mask); + if (imask != NULL) { + return -EEXIST; + } + + imask = malloc(sizeof(*imask)); + if (imask == NULL) { + SPDK_ERRLOG("malloc() failed for inititator mask str\n"); + return -ENOMEM; + } + + imask->mask = strdup(mask); + if (imask->mask == NULL) { + SPDK_ERRLOG("strdup() failed for initiator mask\n"); + free(imask); + return -ENOMEM; + } + + /* Replace "ALL" by "ANY" if set */ + p = strstr(imask->mask, "ALL"); + if (p != NULL) { + SPDK_WARNLOG("Please use \"%s\" instead of \"%s\"\n", "ANY", "ALL"); + SPDK_WARNLOG("Converting \"%s\" to \"%s\" automatically\n", "ALL", "ANY"); + memcpy(p, "ANY", 3); + } + + TAILQ_INSERT_TAIL(&ig->netmask_head, imask, tailq); + ig->nnetmasks++; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", mask); + return 0; +} + +static int +spdk_iscsi_init_grp_delete_netmask(struct spdk_iscsi_init_grp *ig, char *mask) +{ + struct spdk_iscsi_initiator_netmask *imask; + + imask = spdk_iscsi_init_grp_find_netmask(ig, mask); + if (imask == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask->mask); + free(imask); + return 0; +} + +static int +spdk_iscsi_init_grp_add_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = spdk_iscsi_init_grp_add_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; 
--i) { + spdk_iscsi_init_grp_delete_netmask(ig, imasks[i - 1]); + } + return rc; +} + +static void +spdk_iscsi_init_grp_delete_all_netmasks(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_initiator_netmask *imask, *tmp; + + TAILQ_FOREACH_SAFE(imask, &ig->netmask_head, tailq, tmp) { + TAILQ_REMOVE(&ig->netmask_head, imask, tailq); + ig->nnetmasks--; + free(imask->mask); + free(imask); + } +} + +static int +spdk_iscsi_init_grp_delete_netmasks(struct spdk_iscsi_init_grp *ig, int num_imasks, char **imasks) +{ + int i; + int rc; + + for (i = 0; i < num_imasks; i++) { + rc = spdk_iscsi_init_grp_delete_netmask(ig, imasks[i]); + if (rc != 0) { + goto cleanup; + } + } + return 0; + +cleanup: + for (; i > 0; --i) { + rc = spdk_iscsi_init_grp_add_netmask(ig, imasks[i - 1]); + if (rc != 0) { + spdk_iscsi_init_grp_delete_all_netmasks(ig); + break; + } + } + return -1; +} + +/* Read spdk iscsi target's config file and create initiator group */ +static int +spdk_iscsi_parse_init_grp(struct spdk_conf_section *sp) +{ + int i, rc = 0; + const char *val = NULL; + int num_initiator_names; + int num_initiator_masks; + char **initiators = NULL, **netmasks = NULL; + int tag = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add initiator group %d\n", tag); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + /* counts number of definitions */ + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_names = 0\n"); + return -EINVAL; + } + num_initiator_names = i; + if (num_initiator_names > MAX_INITIATOR) { + SPDK_ERRLOG("%d > MAX_INITIATOR\n", num_initiator_names); + return -E2BIG; + } + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (val == NULL) { + break; + } + } + if (i == 0) { + SPDK_ERRLOG("num_initiator_mask = 0\n"); + return -EINVAL; + } + num_initiator_masks = i; + if (num_initiator_masks > MAX_NETMASK) { + SPDK_ERRLOG("%d > MAX_NETMASK\n", num_initiator_masks); + return -E2BIG; + } + + initiators = calloc(num_initiator_names, sizeof(char *)); + if (!initiators) { + SPDK_ERRLOG("calloc() failed for temp initiator name array\n"); + return -ENOMEM; + } + for (i = 0; i < num_initiator_names; i++) { + val = spdk_conf_section_get_nval(sp, "InitiatorName", i); + if (!val) { + SPDK_ERRLOG("InitiatorName %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "InitiatorName %s\n", val); + initiators[i] = strdup(val); + if (!initiators[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator name\n"); + rc = -ENOMEM; + goto cleanup; + } + } + netmasks = calloc(num_initiator_masks, sizeof(char *)); + if (!netmasks) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + rc = -ENOMEM; + goto cleanup; + } + for (i = 0; i < num_initiator_masks; i++) { + val = spdk_conf_section_get_nval(sp, "Netmask", i); + if (!val) { + SPDK_ERRLOG("Netmask %d not found\n", i); + rc = -EINVAL; + goto cleanup; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Netmask %s\n", val); + netmasks[i] = strdup(val); + if (!netmasks[i]) { + SPDK_ERRLOG("strdup() failed for temp initiator mask\n"); + rc = -ENOMEM; + goto cleanup; + } + } + + rc = spdk_iscsi_init_grp_create_from_initiator_list(tag, + num_initiator_names, initiators, num_initiator_masks, netmasks); + +cleanup: + if (initiators) { + for (i = 0; i < num_initiator_names; i++) { + if (initiators[i]) { + 
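+ /* The group keeps its own strdup()'d copies of these strings, so the
+  * temporary config values are released on both the success and error paths. */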
free(initiators[i]); + } + } + free(initiators); + } + if (netmasks) { + for (i = 0; i < num_initiator_masks; i++) { + if (netmasks[i]) { + free(netmasks[i]); + } + } + free(netmasks); + } + return rc; +} + +int +spdk_iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_init_grp *tmp; + int rc = -1; + + assert(ig != NULL); + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + tmp = spdk_iscsi_init_grp_find_by_tag(ig->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_spdk_iscsi.ig_head, ig, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return rc; +} + +/* + * Create initiator group from list of initiator ip/hostnames and netmasks + * The initiator hostname/netmask lists are allocated by the caller on the + * heap. Freed later by common initiator_group_destroy() code + */ +int +spdk_iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator group (from initiator list) tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + ig = spdk_iscsi_init_grp_create(tag); + if (!ig) { + SPDK_ERRLOG("initiator group create error (%d)\n", tag); + return rc; + } + + rc = spdk_iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto cleanup; + } + + rc = spdk_iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + goto cleanup; + } + + rc = spdk_iscsi_init_grp_register(ig); + if (rc < 0) { + SPDK_ERRLOG("initiator group register error (%d)\n", tag); + goto cleanup; + } + return 0; + +cleanup: + spdk_iscsi_init_grp_destroy(ig); + return rc; +} + +int +spdk_iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "add initiator to initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + ig = spdk_iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = spdk_iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("add initiator name error\n"); + goto error; + } + + rc = spdk_iscsi_init_grp_add_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("add initiator netmask error\n"); + spdk_iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + } + +error: + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return rc; +} + +int +spdk_iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, + char **initiator_names, + int num_initiator_masks, + char **initiator_masks) +{ + int rc = -1; + struct spdk_iscsi_init_grp *ig; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "delete initiator from initiator group: tag=%d, #initiators=%d, #masks=%d\n", + tag, num_initiator_names, num_initiator_masks); + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + ig = spdk_iscsi_init_grp_find_by_tag(tag); + if (!ig) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + 
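+ /* No group is registered under this tag; the mutex has already been
+  * released for this early return. */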
SPDK_ERRLOG("initiator group (%d) is not found\n", tag); + return rc; + } + + rc = spdk_iscsi_init_grp_delete_initiators(ig, num_initiator_names, + initiator_names); + if (rc < 0) { + SPDK_ERRLOG("delete initiator name error\n"); + goto error; + } + + rc = spdk_iscsi_init_grp_delete_netmasks(ig, num_initiator_masks, + initiator_masks); + if (rc < 0) { + SPDK_ERRLOG("delete initiator netmask error\n"); + spdk_iscsi_init_grp_add_initiators(ig, num_initiator_names, + initiator_names); + goto error; + } + +error: + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return rc; +} + +void +spdk_iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig) +{ + if (!ig) { + return; + } + + spdk_iscsi_init_grp_delete_all_initiators(ig); + spdk_iscsi_init_grp_delete_all_netmasks(ig); + free(ig); +}; + +struct spdk_iscsi_init_grp * +spdk_iscsi_init_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_spdk_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + return ig; + } + } + + return NULL; +} + +int +spdk_iscsi_parse_init_grps(void) +{ + struct spdk_conf_section *sp; + int rc; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "InitiatorGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + rc = spdk_iscsi_parse_init_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_init_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +spdk_iscsi_init_grps_destroy(void) +{ + struct spdk_iscsi_init_grp *ig, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_init_grp_array_destroy\n"); + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH_SAFE(ig, &g_spdk_iscsi.ig_head, tailq, tmp) { + TAILQ_REMOVE(&g_spdk_iscsi.ig_head, ig, tailq); + spdk_iscsi_init_grp_destroy(ig); + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); +} + +struct spdk_iscsi_init_grp * +spdk_iscsi_init_grp_unregister(int tag) +{ + struct spdk_iscsi_init_grp *ig; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH(ig, &g_spdk_iscsi.ig_head, tailq) { + if (ig->tag == tag) { + TAILQ_REMOVE(&g_spdk_iscsi.ig_head, ig, tailq); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return ig; + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return NULL; +} + +static const char *initiator_group_section = \ + "\n" + "# Users must change the InitiatorGroup section(s) to match the IP\n" + "# addresses and initiator configuration in their environment.\n" + "# Netmask can be used to specify a single IP address or a range of IP addresses\n" + "# Netmask 192.168.1.20 <== single IP address\n" + "# Netmask 192.168.1.0/24 <== IP range 192.168.1.*\n"; + +#define INITIATOR_GROUP_TMPL \ +"[InitiatorGroup%d]\n" \ +" Comment \"Initiator Group%d\"\n" + +#define INITIATOR_TMPL \ +" InitiatorName " + +#define NETMASK_TMPL \ +" Netmask " + +void +spdk_iscsi_init_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + /* Create initiator group section */ + fprintf(fp, "%s", initiator_group_section); + + /* Dump initiator groups */ + TAILQ_FOREACH(ig, &g_spdk_iscsi.ig_head, tailq) { + if (NULL == ig) { continue; } + fprintf(fp, INITIATOR_GROUP_TMPL, ig->tag, ig->tag); + + /* Dump initiators */ + fprintf(fp, INITIATOR_TMPL); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + fprintf(fp, "%s ", iname->name); + } + fprintf(fp, "\n"); + + /* Dump netmasks */ + fprintf(fp, NETMASK_TMPL); + 
TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + fprintf(fp, "%s ", imask->mask); + } + fprintf(fp, "\n"); + } +} + +static void +spdk_iscsi_init_grp_info_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_initiator_name *iname; + struct spdk_iscsi_initiator_netmask *imask; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", ig->tag); + + spdk_json_write_named_array_begin(w, "initiators"); + TAILQ_FOREACH(iname, &ig->initiator_head, tailq) { + spdk_json_write_string(w, iname->name); + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "netmasks"); + TAILQ_FOREACH(imask, &ig->netmask_head, tailq) { + spdk_json_write_string(w, imask->mask); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +spdk_iscsi_init_grp_config_json(struct spdk_iscsi_init_grp *ig, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "add_initiator_group"); + + spdk_json_write_name(w, "params"); + spdk_iscsi_init_grp_info_json(ig, w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_init_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_spdk_iscsi.ig_head, tailq) { + spdk_iscsi_init_grp_info_json(ig, w); + } +} + +void +spdk_iscsi_init_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_init_grp *ig; + + TAILQ_FOREACH(ig, &g_spdk_iscsi.ig_head, tailq) { + spdk_iscsi_init_grp_config_json(ig, w); + } +} diff --git a/src/spdk/lib/iscsi/init_grp.h b/src/spdk/lib/iscsi/init_grp.h new file mode 100644 index 00000000..ff24ee5b --- /dev/null +++ b/src/spdk/lib/iscsi/init_grp.h @@ -0,0 +1,79 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_INIT_GRP_H +#define SPDK_INIT_GRP_H + +#include "spdk/conf.h" + +struct spdk_iscsi_initiator_name { + char *name; + TAILQ_ENTRY(spdk_iscsi_initiator_name) tailq; +}; + +struct spdk_iscsi_initiator_netmask { + char *mask; + TAILQ_ENTRY(spdk_iscsi_initiator_netmask) tailq; +}; + +struct spdk_iscsi_init_grp { + int ninitiators; + TAILQ_HEAD(, spdk_iscsi_initiator_name) initiator_head; + int nnetmasks; + TAILQ_HEAD(, spdk_iscsi_initiator_netmask) netmask_head; + int ref; + int tag; + TAILQ_ENTRY(spdk_iscsi_init_grp) tailq; +}; + +/* SPDK iSCSI Initiator Group management API */ +int spdk_iscsi_init_grp_create_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int spdk_iscsi_init_grp_add_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int spdk_iscsi_init_grp_delete_initiators_from_initiator_list(int tag, + int num_initiator_names, char **initiator_names, + int num_initiator_masks, char **initiator_masks); +int spdk_iscsi_init_grp_register(struct spdk_iscsi_init_grp *ig); +struct spdk_iscsi_init_grp *spdk_iscsi_init_grp_unregister(int tag); +struct spdk_iscsi_init_grp *spdk_iscsi_init_grp_find_by_tag(int tag); +void spdk_iscsi_init_grp_destroy(struct spdk_iscsi_init_grp *ig); +int spdk_iscsi_parse_init_grps(void); +void spdk_iscsi_init_grps_destroy(void); +void spdk_iscsi_init_grps_config_text(FILE *fp); +void spdk_iscsi_init_grps_info_json(struct spdk_json_write_ctx *w); +void spdk_iscsi_init_grps_config_json(struct spdk_json_write_ctx *w); +#endif // SPDK_INIT_GRP_H diff --git a/src/spdk/lib/iscsi/iscsi.c b/src/spdk/lib/iscsi/iscsi.c new file mode 100644 index 00000000..7d96c9cb --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.c @@ -0,0 +1,4583 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/trace.h" +#include "spdk/string.h" +#include "spdk/queue.h" +#include "spdk/net.h" + +#include "iscsi/md5.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/tgt_node.h" +#include "iscsi/task.h" +#include "iscsi/conn.h" +#include "spdk/scsi.h" +#include "spdk/bdev.h" +#include "iscsi/portal_grp.h" +#include "iscsi/acceptor.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +#define SPDK_CRC32C_INITIAL 0xffffffffUL +#define SPDK_CRC32C_XOR 0xffffffffUL + +#ifdef __FreeBSD__ +#define HAVE_SRANDOMDEV 1 +#define HAVE_ARC4RANDOM 1 +#endif + +struct spdk_iscsi_globals g_spdk_iscsi = { + .mutex = PTHREAD_MUTEX_INITIALIZER, + .portal_head = TAILQ_HEAD_INITIALIZER(g_spdk_iscsi.portal_head), + .pg_head = TAILQ_HEAD_INITIALIZER(g_spdk_iscsi.pg_head), + .ig_head = TAILQ_HEAD_INITIALIZER(g_spdk_iscsi.ig_head), + .target_head = TAILQ_HEAD_INITIALIZER(g_spdk_iscsi.target_head), + .auth_group_head = TAILQ_HEAD_INITIALIZER(g_spdk_iscsi.auth_group_head), +}; + +/* random value generation */ +static void spdk_gen_random(uint8_t *buf, size_t len); +#ifndef HAVE_SRANDOMDEV +static void srandomdev(void); +#endif /* HAVE_SRANDOMDEV */ +#ifndef HAVE_ARC4RANDOM +//static uint32_t arc4random(void); +#endif /* HAVE_ARC4RANDOM */ + +/* convert from/to bin/hex */ +static int spdk_bin2hex(char *buf, size_t len, const uint8_t *data, size_t data_len); +static int spdk_hex2bin(uint8_t *data, size_t data_len, const char *str); + +static int spdk_add_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); + +static int spdk_iscsi_send_r2t(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int offset, + int len, uint32_t transfer_tag, uint32_t *R2TSN); +static int spdk_iscsi_send_r2t_recovery(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *r2t_task, uint32_t r2t_sn, + bool send_new_r2tsn); + +static int spdk_create_iscsi_sess(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, enum session_type session_type); +static int spdk_append_iscsi_sess(struct spdk_iscsi_conn *conn, + const char *initiator_port_name, uint16_t tsih, uint16_t cid); + +static void spdk_remove_acked_pdu(struct spdk_iscsi_conn *conn, uint32_t ExpStatSN); + +static int spdk_iscsi_reject(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + int reason); + +#define DMIN32(A,B) ((uint32_t) ((uint32_t)(A) > (uint32_t)(B) ? (uint32_t)(B) : (uint32_t)(A))) +#define DMIN64(A,B) ((uint64_t) ((A) > (B) ? 
(B) : (A))) + +#define MATCH_DIGEST_WORD(BUF, CRC32C) \ + ( ((((uint32_t) *((uint8_t *)(BUF)+0)) << 0) \ + | (((uint32_t) *((uint8_t *)(BUF)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(BUF)+2)) << 16) \ + | (((uint32_t) *((uint8_t *)(BUF)+3)) << 24)) \ + == (CRC32C)) + +#define MAKE_DIGEST_WORD(BUF, CRC32C) \ + ( ((*((uint8_t *)(BUF)+0)) = (uint8_t)((uint32_t)(CRC32C) >> 0)), \ + ((*((uint8_t *)(BUF)+1)) = (uint8_t)((uint32_t)(CRC32C) >> 8)), \ + ((*((uint8_t *)(BUF)+2)) = (uint8_t)((uint32_t)(CRC32C) >> 16)), \ + ((*((uint8_t *)(BUF)+3)) = (uint8_t)((uint32_t)(CRC32C) >> 24))) + +#if 0 +static int +spdk_match_digest_word(const uint8_t *buf, uint32_t crc32c) +{ + uint32_t l; + + l = (buf[0] & 0xffU) << 0; + l |= (buf[1] & 0xffU) << 8; + l |= (buf[2] & 0xffU) << 16; + l |= (buf[3] & 0xffU) << 24; + return (l == crc32c); +} + +static uint8_t * +spdk_make_digest_word(uint8_t *buf, size_t len, uint32_t crc32c) +{ + if (len < ISCSI_DIGEST_LEN) { + return NULL; + } + + buf[0] = (crc32c >> 0) & 0xffU; + buf[1] = (crc32c >> 8) & 0xffU; + buf[2] = (crc32c >> 16) & 0xffU; + buf[3] = (crc32c >> 24) & 0xffU; + return buf; +} +#endif + +#ifndef HAVE_SRANDOMDEV +static void +srandomdev(void) +{ + unsigned long seed; + time_t now; + pid_t pid; + + pid = getpid(); + now = time(NULL); + seed = pid ^ now; + srandom(seed); +} +#endif /* HAVE_SRANDOMDEV */ + +#ifndef HAVE_ARC4RANDOM +static int spdk_arc4random_initialized = 0; + +static uint32_t +arc4random(void) +{ + uint32_t r; + uint32_t r1, r2; + + if (!spdk_arc4random_initialized) { + srandomdev(); + spdk_arc4random_initialized = 1; + } + r1 = (uint32_t)(random() & 0xffff); + r2 = (uint32_t)(random() & 0xffff); + r = (r1 << 16) | r2; + return r; +} +#endif /* HAVE_ARC4RANDOM */ + +static void +spdk_gen_random(uint8_t *buf, size_t len) +{ +#ifdef USE_RANDOM + long l; + size_t idx; + + srandomdev(); + for (idx = 0; idx < len; idx++) { + l = random(); + buf[idx] = (uint8_t) l; + } +#else + uint32_t r; + size_t idx; + + for (idx = 0; idx < len; idx++) { + r = arc4random(); + buf[idx] = (uint8_t) r; + } +#endif /* USE_RANDOM */ +} + +static uint64_t +spdk_iscsi_get_isid(const uint8_t isid[6]) +{ + return (uint64_t)isid[0] << 40 | + (uint64_t)isid[1] << 32 | + (uint64_t)isid[2] << 24 | + (uint64_t)isid[3] << 16 | + (uint64_t)isid[4] << 8 | + (uint64_t)isid[5]; +} + +static int +spdk_bin2hex(char *buf, size_t len, const uint8_t *data, size_t data_len) +{ + const char *digits = "0123456789ABCDEF"; + size_t total = 0; + size_t idx; + + if (len < 3) { + return -1; + } + buf[total] = '0'; + total++; + buf[total] = 'x'; + total++; + buf[total] = '\0'; + + for (idx = 0; idx < data_len; idx++) { + if (total + 3 > len) { + buf[total] = '\0'; + return - 1; + } + buf[total] = digits[(data[idx] >> 4) & 0x0fU]; + total++; + buf[total] = digits[data[idx] & 0x0fU]; + total++; + } + buf[total] = '\0'; + return total; +} + +static int +spdk_hex2bin(uint8_t *data, size_t data_len, const char *str) +{ + const char *digits = "0123456789ABCDEF"; + const char *dp; + const char *p; + size_t total = 0; + int n0, n1; + + p = str; + if (p[0] != '0' && (p[1] != 'x' && p[1] != 'X')) { + return -1; + } + p += 2; + + while (p[0] != '\0' && p[1] != '\0') { + if (total >= data_len) { + return -1; + } + dp = strchr(digits, toupper((int) p[0])); + if (dp == NULL) { + return -1; + } + n0 = (int)(dp - digits); + dp = strchr(digits, toupper((int) p[1])); + if (dp == NULL) { + return -1; + } + n1 = (int)(dp - digits); + + data[total] = (uint8_t)(((n0 & 0x0fU) << 4) | (n1 & 0x0fU)); + total++; + p 
+= 2; + } + return total; +} + +static int +spdk_islun2lun(uint64_t islun) +{ + uint64_t fmt_lun; + uint64_t method; + int lun_i; + + fmt_lun = islun; + method = (fmt_lun >> 62) & 0x03U; + fmt_lun = fmt_lun >> 48; + if (method == 0x00U) { + lun_i = (int)(fmt_lun & 0x00ffU); + } else if (method == 0x01U) { + lun_i = (int)(fmt_lun & 0x3fffU); + } else { + lun_i = 0xffffU; + } + return lun_i; +} + +static uint32_t +spdk_iscsi_pdu_calc_header_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t crc32c; + uint32_t ahs_len_bytes = pdu->bhs.total_ahs_len * 4; + + crc32c = SPDK_CRC32C_INITIAL; + crc32c = spdk_crc32c_update(&pdu->bhs, ISCSI_BHS_LEN, crc32c); + + if (ahs_len_bytes) { + crc32c = spdk_crc32c_update(pdu->ahs, ahs_len_bytes, crc32c); + } + + /* BHS and AHS are always 4-byte multiples in length, so no padding is necessary. */ + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +static uint32_t +spdk_iscsi_pdu_calc_data_digest(struct spdk_iscsi_pdu *pdu) +{ + uint32_t data_len = DGET24(pdu->bhs.data_segment_len); + uint32_t crc32c; + uint32_t mod; + + crc32c = SPDK_CRC32C_INITIAL; + crc32c = spdk_crc32c_update(pdu->data, data_len, crc32c); + + mod = data_len % ISCSI_ALIGNMENT; + if (mod != 0) { + uint32_t pad_length = ISCSI_ALIGNMENT - mod; + uint8_t pad[3] = {0, 0, 0}; + + assert(pad_length > 0); + assert(pad_length <= sizeof(pad)); + crc32c = spdk_crc32c_update(pad, pad_length, crc32c); + } + + crc32c = crc32c ^ SPDK_CRC32C_XOR; + return crc32c; +} + +int +spdk_iscsi_read_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu **_pdu) +{ + struct spdk_iscsi_pdu *pdu; + struct spdk_mempool *pool; + uint32_t crc32c; + int ahs_len; + int data_len; + int max_segment_len; + int rc; + + if (conn->pdu_in_progress == NULL) { + conn->pdu_in_progress = spdk_get_pdu(); + if (conn->pdu_in_progress == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + pdu = conn->pdu_in_progress; + + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + rc = spdk_iscsi_conn_read_data(conn, + ISCSI_BHS_LEN - pdu->bhs_valid_bytes, + (uint8_t *)&pdu->bhs + pdu->bhs_valid_bytes); + if (rc < 0) { + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return rc; + } + pdu->bhs_valid_bytes += rc; + if (pdu->bhs_valid_bytes < ISCSI_BHS_LEN) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + } + + data_len = ISCSI_ALIGN(DGET24(pdu->bhs.data_segment_len)); + + /* AHS */ + ahs_len = pdu->bhs.total_ahs_len * 4; + assert(ahs_len <= ISCSI_AHS_LEN); + if (pdu->ahs_valid_bytes < ahs_len) { + rc = spdk_iscsi_conn_read_data(conn, + ahs_len - pdu->ahs_valid_bytes, + pdu->ahs + pdu->ahs_valid_bytes); + if (rc < 0) { + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return rc; + } + + pdu->ahs_valid_bytes += rc; + if (pdu->ahs_valid_bytes < ahs_len) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + } + + /* Header Digest */ + if (conn->header_digest && + pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = spdk_iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->hdigest_valid_bytes, + pdu->header_digest + pdu->hdigest_valid_bytes); + if (rc < 0) { + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return rc; + } + + pdu->hdigest_valid_bytes += rc; + if (pdu->hdigest_valid_bytes < ISCSI_DIGEST_LEN) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + } + + /* copy the actual data into local buffer */ + if (pdu->data_valid_bytes < data_len) { + if (pdu->data_buf == NULL) { + if (data_len <= spdk_get_immediate_data_buffer_size()) { + pool = g_spdk_iscsi.pdu_immediate_data_pool; + } else if 
(data_len <= spdk_get_data_out_buffer_size()) { + pool = g_spdk_iscsi.pdu_data_out_pool; + } else { + SPDK_ERRLOG("Data(%d) > MaxSegment(%d)\n", + data_len, spdk_get_data_out_buffer_size()); + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return SPDK_ISCSI_CONNECTION_FATAL; + } + pdu->mobj = spdk_mempool_get(pool); + if (pdu->mobj == NULL) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + pdu->data_buf = pdu->mobj->buf; + } + + rc = spdk_iscsi_conn_read_data(conn, + data_len - pdu->data_valid_bytes, + pdu->data_buf + pdu->data_valid_bytes); + if (rc < 0) { + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return rc; + } + + pdu->data_valid_bytes += rc; + if (pdu->data_valid_bytes < data_len) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + } + + /* copy out the data digest */ + if (conn->data_digest && data_len != 0 && + pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + rc = spdk_iscsi_conn_read_data(conn, + ISCSI_DIGEST_LEN - pdu->ddigest_valid_bytes, + pdu->data_digest + pdu->ddigest_valid_bytes); + if (rc < 0) { + *_pdu = NULL; + spdk_put_pdu(pdu); + conn->pdu_in_progress = NULL; + return rc; + } + + pdu->ddigest_valid_bytes += rc; + if (pdu->ddigest_valid_bytes < ISCSI_DIGEST_LEN) { + *_pdu = NULL; + return SPDK_SUCCESS; + } + } + + /* All data for this PDU has now been read from the socket. */ + conn->pdu_in_progress = NULL; + + spdk_trace_record(TRACE_ISCSI_READ_PDU, conn->id, pdu->data_valid_bytes, + (uintptr_t)pdu, pdu->bhs.opcode); + + /* Data Segment */ + if (data_len != 0) { + /* + * Determine the maximum segment length expected for this PDU. + * This will be used to make sure the initiator did not send + * us too much immediate data. + * + * This value is specified separately by the initiator and target, + * and not negotiated. So we can use the #define safely here, + * since the value is not dependent on the initiator's maximum + * segment lengths (FirstBurstLength/MaxRecvDataSegmentLength), + * and SPDK currently does not allow configuration of these values + * at runtime. + */ + if (conn->sess == NULL) { + /* + * If the connection does not yet have a session, then + * login is not complete and we use the 8KB default + * FirstBurstLength as our maximum data segment length + * value. + */ + max_segment_len = DEFAULT_FIRSTBURSTLENGTH; + } else if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAOUT) { + max_segment_len = spdk_get_data_out_buffer_size(); + } else if (pdu->bhs.opcode == ISCSI_OP_NOPOUT) { + max_segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + } else { + max_segment_len = spdk_get_immediate_data_buffer_size(); + } + if (data_len > max_segment_len) { + SPDK_ERRLOG("Data(%d) > MaxSegment(%d)\n", data_len, max_segment_len); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + spdk_put_pdu(pdu); + + /* + * If spdk_iscsi_reject() was not able to reject the PDU, + * treat it as a fatal connection error. Otherwise, + * return SUCCESS here so that the caller will continue + * to attempt to read PDUs. + */ + rc = (rc < 0) ? 
SPDK_ISCSI_CONNECTION_FATAL : SPDK_SUCCESS; + return rc; + } + + pdu->data = pdu->data_buf; + pdu->data_from_mempool = true; + pdu->data_segment_len = data_len; + } + + /* check digest */ + if (conn->header_digest) { + crc32c = spdk_iscsi_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->header_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error (%s)\n", conn->initiator_name); + spdk_put_pdu(pdu); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + if (conn->data_digest && data_len != 0) { + crc32c = spdk_iscsi_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error (%s)\n", conn->initiator_name); + spdk_put_pdu(pdu); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + *_pdu = pdu; + return 1; +} + +int +spdk_iscsi_build_iovecs(struct spdk_iscsi_conn *conn, struct iovec *iovec, + struct spdk_iscsi_pdu *pdu) +{ + int iovec_cnt = 0; + uint32_t crc32c; + int enable_digest; + int total_ahs_len; + int data_len; + + total_ahs_len = pdu->bhs.total_ahs_len; + data_len = DGET24(pdu->bhs.data_segment_len); + + enable_digest = 1; + if (pdu->bhs.opcode == ISCSI_OP_LOGIN_RSP) { + /* this PDU should be sent without digest */ + enable_digest = 0; + } + + /* BHS */ + iovec[iovec_cnt].iov_base = &pdu->bhs; + iovec[iovec_cnt].iov_len = ISCSI_BHS_LEN; + iovec_cnt++; + + /* AHS */ + if (total_ahs_len > 0) { + iovec[iovec_cnt].iov_base = pdu->ahs; + iovec[iovec_cnt].iov_len = 4 * total_ahs_len; + iovec_cnt++; + } + + /* Header Digest */ + if (enable_digest && conn->header_digest) { + crc32c = spdk_iscsi_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD(pdu->header_digest, crc32c); + + iovec[iovec_cnt].iov_base = pdu->header_digest; + iovec[iovec_cnt].iov_len = ISCSI_DIGEST_LEN; + iovec_cnt++; + } + + /* Data Segment */ + if (data_len > 0) { + iovec[iovec_cnt].iov_base = pdu->data; + iovec[iovec_cnt].iov_len = ISCSI_ALIGN(data_len); + iovec_cnt++; + } + + /* Data Digest */ + if (enable_digest && conn->data_digest && data_len != 0) { + crc32c = spdk_iscsi_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + + iovec[iovec_cnt].iov_base = pdu->data_digest; + iovec[iovec_cnt].iov_len = ISCSI_DIGEST_LEN; + iovec_cnt++; + } + + return iovec_cnt; +} + +static int +spdk_iscsi_append_text(struct spdk_iscsi_conn *conn __attribute__((__unused__)), + const char *key, const char *val, uint8_t *data, + int alloc_len, int data_len) +{ + int total; + int len; + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return total; + } + len = snprintf((char *) data + total, alloc_len - total, "%s=%s", key, val); + total += len + 1; + + return total; +} + +static int +spdk_iscsi_append_param(struct spdk_iscsi_conn *conn, const char *key, + uint8_t *data, int alloc_len, int data_len) +{ + struct iscsi_param *param; + int rc; + + param = spdk_iscsi_param_find(conn->params, key); + if (param == NULL) { + param = spdk_iscsi_param_find(conn->sess->params, key); + if (param == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "no key %.64s\n", key); + return data_len; + } + } + rc = spdk_iscsi_append_text(conn, param->key, param->val, data, + alloc_len, data_len); + return rc; +} + +static int +spdk_iscsi_get_authinfo(struct spdk_iscsi_conn *conn, const char *authuser) +{ + int ag_tag; + int rc; + + if (conn->sess->target != NULL) { + ag_tag = 
conn->sess->target->chap_group; + } else { + ag_tag = -1; + } + if (ag_tag < 0) { + ag_tag = g_spdk_iscsi.chap_group; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ag_tag=%d\n", ag_tag); + + rc = spdk_iscsi_chap_get_authinfo(&conn->auth, authuser, ag_tag); + if (rc < 0) { + SPDK_ERRLOG("chap_get_authinfo() failed\n"); + return -1; + } + return 0; +} + +static int +spdk_iscsi_auth_params(struct spdk_iscsi_conn *conn, + struct iscsi_param *params, const char *method, uint8_t *data, + int alloc_len, int data_len) +{ + char *in_val; + char *in_next; + char *new_val; + const char *val; + const char *user; + const char *response; + const char *challenge; + int total; + int rc; + + if (conn == NULL || params == NULL || method == NULL) { + return -1; + } + if (strcasecmp(method, "CHAP") == 0) { + /* method OK */ + } else { + SPDK_ERRLOG("unsupported AuthMethod %.64s\n", method); + return -1; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + /* for temporary store */ + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for temporary store\n"); + return -ENOMEM; + } + + /* CHAP method (RFC1994) */ + if ((val = spdk_iscsi_param_get_val(params, "CHAP_A")) != NULL) { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_A) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + /* CHAP_A is LIST type */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", val); + in_next = in_val; + while ((new_val = spdk_strsepq(&in_next, ",")) != NULL) { + if (strcasecmp(new_val, "5") == 0) { + /* CHAP with MD5 */ + break; + } + } + if (new_val == NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + spdk_iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + goto error_return; + } + /* selected algorithm is 5 (MD5) */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_A=%s\n", new_val); + total = spdk_iscsi_append_text(conn, "CHAP_A", new_val, + data, alloc_len, total); + + /* Identifier is one octet */ + spdk_gen_random(conn->auth.chap_id, 1); + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + (int) conn->auth.chap_id[0]); + total = spdk_iscsi_append_text(conn, "CHAP_I", in_val, + data, alloc_len, total); + + /* Challenge Value is a variable stream of octets */ + /* (binary length MUST not exceed 1024 bytes) */ + conn->auth.chap_challenge_len = ISCSI_CHAP_CHALLENGE_LEN; + spdk_gen_random(conn->auth.chap_challenge, + conn->auth.chap_challenge_len); + spdk_bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, + conn->auth.chap_challenge, + conn->auth.chap_challenge_len); + total = spdk_iscsi_append_text(conn, "CHAP_C", in_val, + data, alloc_len, total); + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_NR; + } else if ((val = spdk_iscsi_param_get_val(params, "CHAP_N")) != NULL) { + uint8_t resmd5[SPDK_MD5DIGEST_LEN]; + uint8_t tgtmd5[SPDK_MD5DIGEST_LEN]; + struct spdk_md5ctx md5ctx; + + user = val; + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_WAIT_NR) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + + response = spdk_iscsi_param_get_val(params, "CHAP_R"); + if (response == NULL) { + SPDK_ERRLOG("no response\n"); + goto error_return; + } + rc = spdk_hex2bin(resmd5, SPDK_MD5DIGEST_LEN, response); + if (rc < 0 || rc != SPDK_MD5DIGEST_LEN) { + SPDK_ERRLOG("response format error\n"); + goto error_return; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_N/CHAP_R\n"); + + rc = spdk_iscsi_get_authinfo(conn, val); + 
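+ /*
+  * Look up the CHAP secret for this user and group, then verify that the
+  * initiator-supplied CHAP_R equals MD5(id || secret || challenge) as
+  * defined by CHAP (RFC 1994).
+  */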
if (rc < 0) { + //SPDK_ERRLOG("auth user or secret is missing\n"); + SPDK_ERRLOG("iscsi_get_authinfo() failed\n"); + goto error_return; + } + if (conn->auth.user[0] == '\0' || conn->auth.secret[0] == '\0') { + //SPDK_ERRLOG("auth user or secret is missing\n"); + SPDK_ERRLOG("auth failed (user %.64s)\n", user); + goto error_return; + } + + spdk_md5init(&md5ctx); + /* Identifier */ + spdk_md5update(&md5ctx, conn->auth.chap_id, 1); + /* followed by secret */ + spdk_md5update(&md5ctx, conn->auth.secret, + strlen(conn->auth.secret)); + /* followed by Challenge Value */ + spdk_md5update(&md5ctx, conn->auth.chap_challenge, + conn->auth.chap_challenge_len); + /* tgtmd5 is expecting Response Value */ + spdk_md5final(tgtmd5, &md5ctx); + + spdk_bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, + tgtmd5, SPDK_MD5DIGEST_LEN); + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "tgtmd5=%s, resmd5=%s\n", in_val, response); + spdk_dump("tgtmd5", tgtmd5, SPDK_MD5DIGEST_LEN); + spdk_dump("resmd5", resmd5, SPDK_MD5DIGEST_LEN); +#endif + + /* compare MD5 digest */ + if (memcmp(tgtmd5, resmd5, SPDK_MD5DIGEST_LEN) != 0) { + /* not match */ + //SPDK_ERRLOG("auth user or secret is missing\n"); + SPDK_ERRLOG("auth failed (user %.64s)\n", user); + goto error_return; + } + /* OK initiator's secret */ + conn->authenticated = 1; + + /* mutual CHAP? */ + val = spdk_iscsi_param_get_val(params, "CHAP_I"); + if (val != NULL) { + conn->auth.chap_mid[0] = (uint8_t) strtol(val, NULL, 10); + challenge = spdk_iscsi_param_get_val(params, "CHAP_C"); + if (challenge == NULL) { + SPDK_ERRLOG("CHAP sequence error\n"); + goto error_return; + } + rc = spdk_hex2bin(conn->auth.chap_mchallenge, + ISCSI_CHAP_CHALLENGE_LEN, + challenge); + if (rc < 0) { + SPDK_ERRLOG("challenge format error\n"); + goto error_return; + } + conn->auth.chap_mchallenge_len = rc; +#if 0 + spdk_dump("MChallenge", conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); +#endif + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got CHAP_I/CHAP_C\n"); + + if (conn->auth.muser[0] == '\0' || conn->auth.msecret[0] == '\0') { + //SPDK_ERRLOG("mutual auth user or secret is missing\n"); + SPDK_ERRLOG("auth failed (user %.64s)\n", user); + goto error_return; + } + + spdk_md5init(&md5ctx); + /* Identifier */ + spdk_md5update(&md5ctx, conn->auth.chap_mid, 1); + /* followed by secret */ + spdk_md5update(&md5ctx, conn->auth.msecret, + strlen(conn->auth.msecret)); + /* followed by Challenge Value */ + spdk_md5update(&md5ctx, conn->auth.chap_mchallenge, + conn->auth.chap_mchallenge_len); + /* tgtmd5 is Response Value */ + spdk_md5final(tgtmd5, &md5ctx); + + spdk_bin2hex(in_val, ISCSI_TEXT_MAX_VAL_LEN, + tgtmd5, SPDK_MD5DIGEST_LEN); + + total = spdk_iscsi_append_text(conn, "CHAP_N", + conn->auth.muser, data, alloc_len, total); + total = spdk_iscsi_append_text(conn, "CHAP_R", + in_val, data, alloc_len, total); + } else { + /* not mutual */ + if (conn->req_mutual) { + SPDK_ERRLOG("required mutual CHAP\n"); + goto error_return; + } + } + + conn->auth.chap_phase = ISCSI_CHAP_PHASE_END; + } else { + /* not found CHAP keys */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "start CHAP\n"); + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + } + + free(in_val); + return total; + +error_return: + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + free(in_val); + return -1; +} + +static int +spdk_iscsi_reject(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu, + int reason) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_reject *rsph; + uint8_t *data; + int total_ahs_len; + int data_len; + int alloc_len; + + 
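+ /*
+  * A Reject PDU returns the header of the offending PDU to the initiator:
+  * its BHS, any AHS, and the header digest (when negotiated) are copied
+  * into the reject's data segment below.
+  */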
total_ahs_len = pdu->bhs.total_ahs_len; + data_len = 0; + alloc_len = ISCSI_BHS_LEN + (4 * total_ahs_len); + + if (conn->header_digest) { + alloc_len += ISCSI_DIGEST_LEN; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Reject PDU reason=%d\n", reason); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u\n", conn->StatSN); + } + + memcpy(data, &pdu->bhs, ISCSI_BHS_LEN); + data_len += ISCSI_BHS_LEN; + + if (total_ahs_len != 0) { + memcpy(data + data_len, pdu->ahs, (4 * total_ahs_len)); + data_len += (4 * total_ahs_len); + } + + if (conn->header_digest) { + memcpy(data + data_len, pdu->header_digest, ISCSI_DIGEST_LEN); + data_len += ISCSI_DIGEST_LEN; + } + + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + free(data); + return -ENOMEM; + } + + rsph = (struct iscsi_bhs_reject *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_REJECT; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + rsph->reason = reason; + DSET24(rsph->data_segment_len, data_len); + + rsph->ffffffff = 0xffffffffU; + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, 1); + to_be32(&rsph->max_cmd_sn, 1); + } + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "PDU", (void *)&rsp_pdu->bhs, ISCSI_BHS_LEN); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + return 0; +} + +static int +spdk_iscsi_check_values(struct spdk_iscsi_conn *conn) +{ + if (conn->sess->FirstBurstLength > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > MaxBurstLength(%d)\n", + conn->sess->FirstBurstLength, + conn->sess->MaxBurstLength); + return -1; + } + if (conn->sess->FirstBurstLength > g_spdk_iscsi.FirstBurstLength) { + SPDK_ERRLOG("FirstBurstLength(%d) > iSCSI target restriction(%d)\n", + conn->sess->FirstBurstLength, g_spdk_iscsi.FirstBurstLength); + return -1; + } + if (conn->sess->MaxBurstLength > 0x00ffffff) { + SPDK_ERRLOG("MaxBurstLength(%d) > 0x00ffffff\n", + conn->sess->MaxBurstLength); + return -1; + } + + if (conn->MaxRecvDataSegmentLength < 512) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) < 512\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + if (conn->MaxRecvDataSegmentLength > 0x00ffffff) { + SPDK_ERRLOG("MaxRecvDataSegmentLength(%d) > 0x00ffffff\n", + conn->MaxRecvDataSegmentLength); + return -1; + } + return 0; +} + +/* + * The response function of spdk_iscsi_op_login + * return: + * 0:success; + * -1:error; + */ +static int +spdk_iscsi_op_login_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param *params) +{ + struct iscsi_bhs_login_rsp *rsph; + int rc; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + DSET24(rsph->data_segment_len, rsp_pdu->data_segment_len); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess != NULL) { + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->exp_cmd_sn, rsp_pdu->cmd_sn); + to_be32(&rsph->max_cmd_sn, rsp_pdu->cmd_sn); + } + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)rsph, 
ISCSI_BHS_LEN); + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "DATA", rsp_pdu->data, rsp_pdu->data_segment_len); + + /* Set T/CSG/NSG to reserved if login error. */ + if (rsph->status_class != 0) { + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + } + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + /* after send PDU digest on/off */ + if (conn->full_feature) { + /* update internal variables */ + rc = spdk_iscsi_copy_param2var(conn); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_copy_param2var() failed\n"); + spdk_iscsi_param_free(params); + return -1; + } + /* check value */ + rc = spdk_iscsi_check_values(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_check_values() failed\n"); + spdk_iscsi_param_free(params); + return -1; + } + } + + spdk_iscsi_param_free(params); + return 0; +} + +/* + * This function is used to del the original param and update it with new + * value + * return: + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_update_param(struct spdk_iscsi_conn *conn, + const char *key, const char *value, + const char *list) +{ + int rc = 0; + struct iscsi_param *new_param, *orig_param; + int index; + + orig_param = spdk_iscsi_param_find(conn->params, key); + if (orig_param == NULL) { + SPDK_ERRLOG("orig_param %s not found\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + index = orig_param->state_index; + rc = spdk_iscsi_param_del(&conn->params, key); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_del(%s) failed\n", key); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + rc = spdk_iscsi_param_add(&conn->params, key, value, list, ISPT_LIST); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param = spdk_iscsi_param_find(conn->params, key); + if (new_param == NULL) { + SPDK_ERRLOG("spdk_iscsi_param_find() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + new_param->state_index = index; + return rc; +} + +/* + * The function which is used to handle the part of session discovery + * return: + * 0, success; + * otherwise: error; + */ +static int +spdk_iscsi_op_login_session_discovery_chap(struct spdk_iscsi_conn *conn) +{ + int rc = 0; + + if (g_spdk_iscsi.disable_chap) { + conn->req_auth = 0; + rc = spdk_iscsi_op_login_update_param(conn, "AuthMethod", "None", "None"); + if (rc < 0) { + return rc; + } + } else if (g_spdk_iscsi.require_chap) { + conn->req_auth = 1; + rc = spdk_iscsi_op_login_update_param(conn, "AuthMethod", "CHAP", "CHAP"); + if (rc < 0) { + return rc; + } + } + if (g_spdk_iscsi.mutual_chap) { + conn->req_mutual = 1; + } + + return rc; +} + +/* + * This function is used to update the param related with chap + * return: + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_negotiate_chap_param(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct spdk_iscsi_tgt_node *target) +{ + int rc; + + if (target->disable_chap) { + conn->req_auth = 0; + rc = spdk_iscsi_op_login_update_param(conn, "AuthMethod", "None", "None"); + if (rc < 0) { + return rc; + } + } else if (target->require_chap) { + conn->req_auth = 1; + rc = spdk_iscsi_op_login_update_param(conn, "AuthMethod", "CHAP", "CHAP"); + if (rc < 0) { + return rc; + } + } + + if (target->mutual_chap) { + conn->req_mutual = 1; + } + + if (target->header_digest) { + /* + * User specified header digests, so update the list of + * HeaderDigest values to remove "None" so that only + * initiators who support CRC32C can connect. 
+ */ + rc = spdk_iscsi_op_login_update_param(conn, "HeaderDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + if (target->data_digest) { + /* + * User specified data digests, so update the list of + * DataDigest values to remove "None" so that only + * initiators who support CRC32C can connect. + */ + rc = spdk_iscsi_op_login_update_param(conn, "DataDigest", "CRC32C", "CRC32C"); + if (rc < 0) { + return rc; + } + } + + return 0; +} + +/* + * This function use to check the session + * return: + * 0, success + * otherwise: error + */ +static int +spdk_iscsi_op_login_check_session(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, int cid) + +{ + int rc = 0; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + /* check existing session */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "isid=%"PRIx64", tsih=%u, cid=%u\n", + spdk_iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), cid); + if (rsph->tsih != 0) { + /* multiple connections */ + rc = spdk_append_iscsi_sess(conn, initiator_port_name, + from_be16(&rsph->tsih), cid); + if (rc < 0) { + SPDK_ERRLOG("isid=%"PRIx64", tsih=%u, cid=%u:" + "spdk_append_iscsi_sess() failed\n", + spdk_iscsi_get_isid(rsph->isid), from_be16(&rsph->tsih), + cid); + /* Can't include in session */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_CONN_ADD_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else if (!g_spdk_iscsi.AllowDuplicateIsid) { + /* new session, drop old sess by the initiator */ + spdk_iscsi_drop_conns(conn, initiator_port_name, 0 /* drop old */); + } + + return rc; +} + +/* + * This function is used to check the target info + * return: + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_check_target(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + const char *target_name, + struct spdk_iscsi_tgt_node **target) +{ + bool result; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + *target = spdk_iscsi_find_tgt_node(target_name); + if (*target == NULL) { + SPDK_WARNLOG("target %s not found\n", target_name); + /* Not found */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_NOT_FOUND; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + result = spdk_iscsi_tgt_node_access(conn, *target, + conn->initiator_name, + conn->initiator_addr); + if (!result) { + SPDK_ERRLOG("access denied\n"); + /* Not found */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_TARGET_NOT_FOUND; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * The function which is used to handle the part of normal login session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + */ +static int +spdk_iscsi_op_login_session_normal(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + struct iscsi_param *params, + struct spdk_iscsi_tgt_node **target, + int cid) +{ + const char *target_name; + const char *target_short_name; + struct iscsi_bhs_login_rsp *rsph; + int rc = 0; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + target_name = spdk_iscsi_param_get_val(params, "TargetName"); + + if (target_name == NULL) { + SPDK_ERRLOG("TargetName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return 
SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + memset(conn->target_short_name, 0, MAX_TARGET_NAME); + target_short_name = strstr(target_name, ":"); + if (target_short_name != NULL) { + target_short_name++; /* Advance past the ':' */ + if (strlen(target_short_name) >= MAX_TARGET_NAME) { + SPDK_ERRLOG("Target Short Name (%s) is more than %u characters\n", + target_short_name, MAX_TARGET_NAME); + return rc; + } + snprintf(conn->target_short_name, MAX_TARGET_NAME, "%s", + target_short_name); + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + rc = spdk_iscsi_op_login_check_target(conn, rsp_pdu, target_name, target); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + if (rc < 0) { + return rc; + } + + conn->target = *target; + conn->dev = (*target)->dev; + conn->target_port = spdk_scsi_dev_find_port_by_id((*target)->dev, + conn->portal->group->tag); + + rc = spdk_iscsi_op_login_check_session(conn, rsp_pdu, + initiator_port_name, cid); + if (rc < 0) { + return rc; + } + + /* force target flags */ + pthread_mutex_lock(&((*target)->mutex)); + rc = spdk_iscsi_op_login_negotiate_chap_param(conn, rsp_pdu, *target); + pthread_mutex_unlock(&((*target)->mutex)); + + return rc; +} + +/* + * This function is used to judge the session type + * return + * 0: success + * otherwise, error + */ +static int +spdk_iscsi_op_login_session_type(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type *session_type, + struct iscsi_param *params) +{ + const char *session_type_str; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + session_type_str = spdk_iscsi_param_get_val(params, "SessionType"); + if (session_type_str == NULL) { + if (rsph->tsih != 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + SPDK_ERRLOG("SessionType is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } else { + if (strcasecmp(session_type_str, "Discovery") == 0) { + *session_type = SESSION_TYPE_DISCOVERY; + } else if (strcasecmp(session_type_str, "Normal") == 0) { + *session_type = SESSION_TYPE_NORMAL; + } else { + *session_type = SESSION_TYPE_INVALID; + SPDK_ERRLOG("SessionType is invalid\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Session Type: %s\n", session_type_str); + + return 0; +} +/* + * This function is used to initialize the port info + * return + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_initialize_port(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + uint32_t name_length, + struct iscsi_param *params) +{ + const char *val; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* Initiator Name and Port */ + val = spdk_iscsi_param_get_val(params, "InitiatorName"); + if (val == NULL) { + SPDK_ERRLOG("InitiatorName is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + snprintf(conn->initiator_name, sizeof(conn->initiator_name), "%s", val); + snprintf(initiator_port_name, name_length, + "%s,i,0x%12.12" PRIx64, val, spdk_iscsi_get_isid(rsph->isid)); + spdk_strlwr(conn->initiator_name); + 
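+ /* The initiator port name is the initiator name plus the ISID, e.g.
+  * "iqn.1994-05.com.example:host1,i,0x0123456789ab" (illustrative name only);
+  * both strings are lower-cased so later comparisons are case-insensitive. */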
spdk_strlwr(initiator_port_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator name: %s\n", conn->initiator_name); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Initiator port: %s\n", initiator_port_name); + + return 0; +} + +/* + * This function is used to set the info in the connection data structure + * return + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_set_conn_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + char *initiator_port_name, + enum session_type session_type, + struct spdk_iscsi_tgt_node *target, int cid) +{ + int rc = 0; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + conn->authenticated = 0; + conn->auth.chap_phase = ISCSI_CHAP_PHASE_WAIT_A; + conn->cid = cid; + + if (conn->sess == NULL) { + /* new session */ + rc = spdk_create_iscsi_sess(conn, target, session_type); + if (rc < 0) { + SPDK_ERRLOG("create_sess() failed\n"); + rsph->status_class = ISCSI_CLASS_TARGET_ERROR; + rsph->status_detail = ISCSI_LOGIN_STATUS_NO_RESOURCES; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + /* initialize parameters */ + conn->StatSN = from_be32(&rsph->stat_sn); + conn->sess->initiator_port = spdk_scsi_port_create(spdk_iscsi_get_isid(rsph->isid), + 0, initiator_port_name); + conn->sess->isid = spdk_iscsi_get_isid(rsph->isid); + conn->sess->target = target; + + /* Discovery sessions will not have a target. */ + if (target != NULL) { + conn->sess->queue_depth = target->queue_depth; + } else { + /* + * Assume discovery sessions have an effective command + * windows size of 1. + */ + conn->sess->queue_depth = 1; + } + conn->sess->ExpCmdSN = rsp_pdu->cmd_sn; + conn->sess->MaxCmdSN = rsp_pdu->cmd_sn + conn->sess->queue_depth - 1; + } + + conn->initiator_port = conn->sess->initiator_port; + + return 0; +} + +/* + * This function is used to set the target info + * return + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_set_target_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + enum session_type session_type, + int alloc_len, + struct spdk_iscsi_tgt_node *target) +{ + char buf[MAX_TMPBUF]; + const char *val; + int rc = 0; + struct spdk_iscsi_portal *portal = conn->portal; + + /* declarative parameters */ + if (target != NULL) { + pthread_mutex_lock(&target->mutex); + if (target->alias != NULL) { + snprintf(buf, sizeof buf, "%s", target->alias); + } else { + snprintf(buf, sizeof buf, "%s", ""); + } + pthread_mutex_unlock(&target->mutex); + rc = spdk_iscsi_param_set(conn->sess->params, "TargetAlias", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + snprintf(buf, sizeof buf, "%s:%s,%d", portal->host, portal->port, + portal->group->tag); + rc = spdk_iscsi_param_set(conn->sess->params, "TargetAddress", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + snprintf(buf, sizeof buf, "%d", portal->group->tag); + rc = spdk_iscsi_param_set(conn->sess->params, "TargetPortalGroupTag", buf); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + /* write in response */ + if (target != NULL) { + val = spdk_iscsi_param_get_val(conn->sess->params, "TargetAlias"); + if (val != NULL && strlen(val) != 0) { + rsp_pdu->data_segment_len = spdk_iscsi_append_param(conn, + "TargetAlias", + rsp_pdu->data, + alloc_len, + rsp_pdu->data_segment_len); + } + if (session_type == SESSION_TYPE_DISCOVERY) { + 
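+ /* Discovery sessions also report TargetAddress
+  * ("<host>:<port>,<portal group tag>") so the initiator
+  * knows where to open the subsequent normal session. */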
rsp_pdu->data_segment_len = spdk_iscsi_append_param(conn, + "TargetAddress", + rsp_pdu->data, + alloc_len, + rsp_pdu->data_segment_len); + } + rsp_pdu->data_segment_len = spdk_iscsi_append_param(conn, + "TargetPortalGroupTag", + rsp_pdu->data, + alloc_len, + rsp_pdu->data_segment_len); + } + + return rc; +} + +/* + * This function is used to handle the login of iscsi initiator when there is + * no session + * return: + * 0, success; + * SPDK_ISCSI_LOGIN_ERROR_PARAMETER, parameter error; + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify the login fail. + */ +static int +spdk_iscsi_op_login_phase_none(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params, + int alloc_len, int cid) +{ + enum session_type session_type; + char initiator_port_name[MAX_INITIATOR_NAME]; + struct iscsi_bhs_login_rsp *rsph; + struct spdk_iscsi_tgt_node *target = NULL; + int rc = 0; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + conn->target = NULL; + conn->dev = NULL; + + rc = spdk_iscsi_op_login_initialize_port(conn, rsp_pdu, + initiator_port_name, MAX_INITIATOR_NAME, params); + if (rc < 0) { + return rc; + } + + rc = spdk_iscsi_op_login_session_type(conn, rsp_pdu, &session_type, + params); + if (rc < 0) { + return rc; + } + + /* Target Name and Port */ + if (session_type == SESSION_TYPE_NORMAL) { + rc = spdk_iscsi_op_login_session_normal(conn, rsp_pdu, + initiator_port_name, + params, &target, cid); + if (rc < 0) { + return rc; + } + + } else if (session_type == SESSION_TYPE_DISCOVERY) { + target = NULL; + rsph->tsih = 0; + + /* force target flags */ + pthread_mutex_lock(&g_spdk_iscsi.mutex); + rc = spdk_iscsi_op_login_session_discovery_chap(conn); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + if (rc < 0) { + return rc; + } + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rc = spdk_iscsi_op_login_set_conn_info(conn, rsp_pdu, initiator_port_name, + session_type, target, cid); + if (rc < 0) { + return rc; + } + + /* limit conns on discovery session */ + if (session_type == SESSION_TYPE_DISCOVERY) { + conn->sess->MaxConnections = 1; + rc = spdk_iscsi_param_set_int(conn->sess->params, + "MaxConnections", + conn->sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + } + + rc = spdk_iscsi_op_login_set_target_info(conn, rsp_pdu, session_type, + alloc_len, target); + if (rc < 0) { + return rc; + } + + return rc; +} + +/* + * The function which is used to initialize the internal response data + * structure of iscsi login function. 
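+ * It allocates the response data buffer, mirrors the T/C/CSG/NSG flags,
+ * ISID, TSIH and ITT from the request, and rejects unsupported versions
+ * or invalid stage combinations before the login parameters are parsed.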
+ * return: + * 0, success; + * otherwise, error; + */ +static int +spdk_iscsi_op_login_rsp_init(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param **params, int *alloc_len, int *cid) +{ + + struct iscsi_bhs_login_req *reqh; + struct iscsi_bhs_login_rsp *rsph; + int rc; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_SUCCESS; + rsph->status_detail = ISCSI_LOGIN_ACCEPT; + rsp_pdu->data_segment_len = 0; + + /* Default MaxRecvDataSegmentLength - RFC3720(12.12) */ + if (conn->MaxRecvDataSegmentLength < 8192) { + *alloc_len = 8192; + } else { + *alloc_len = conn->MaxRecvDataSegmentLength; + } + + rsp_pdu->data = calloc(1, *alloc_len); + if (!rsp_pdu->data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + return -ENOMEM; + } + + reqh = (struct iscsi_bhs_login_req *)&pdu->bhs; + rsph->flags |= (reqh->flags & ISCSI_LOGIN_TRANSIT); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CONTINUE); + rsph->flags |= (reqh->flags & ISCSI_LOGIN_CURRENT_STAGE_MASK); + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rsph->flags |= (reqh->flags & ISCSI_LOGIN_NEXT_STAGE_MASK); + } + + /* We don't need to convert from network byte order. Just store it */ + memcpy(&rsph->isid, reqh->isid, 6); + rsph->tsih = reqh->tsih; + rsph->itt = reqh->itt; + rsp_pdu->cmd_sn = from_be32(&reqh->cmd_sn); + *cid = from_be16(&reqh->cid); + + if (rsph->tsih) { + rsph->stat_sn = reqh->exp_stat_sn; + } + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "PDU", (uint8_t *)&pdu->bhs, ISCSI_BHS_LEN); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "T=%d, C=%d, CSG=%d, NSG=%d, Min=%d, Max=%d, ITT=%x\n", + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags), + ISCSI_BHS_LOGIN_GET_CSG(rsph->flags), + ISCSI_BHS_LOGIN_GET_NSG(rsph->flags), + reqh->version_min, reqh->version_max, from_be32(&rsph->itt)); + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u," + "MaxCmdSN=%u\n", rsp_pdu->cmd_sn, + from_be32(&rsph->stat_sn), conn->StatSN, + conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + rsp_pdu->cmd_sn, from_be32(&rsph->stat_sn), + conn->StatSN); + } + + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags) && + ISCSI_BHS_LOGIN_GET_CBIT(rsph->flags)) { + SPDK_ERRLOG("transit error\n"); + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + /* make sure reqh->version_max < ISCSI_VERSION */ + if (reqh->version_min > ISCSI_VERSION) { + SPDK_ERRLOG("unsupported version %d/%d\n", reqh->version_min, + reqh->version_max); + /* Unsupported version */ + /* set all reserved flag to zero */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_UNSUPPORTED_VERSION; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + if ((ISCSI_BHS_LOGIN_GET_NSG(rsph->flags) == ISCSI_NSG_RESERVED_CODE) && + ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + /* set NSG to zero */ + rsph->flags &= ~ISCSI_LOGIN_NEXT_STAGE_MASK; + /* also set other bits to zero */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + rsph->flags &= ~ISCSI_LOGIN_CURRENT_STAGE_MASK; + SPDK_ERRLOG("Received reserved NSG code: %d\n", ISCSI_NSG_RESERVED_CODE); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + /* store incoming parameters */ + rc = spdk_iscsi_parse_params(params, pdu->data, + 
pdu->data_segment_len, ISCSI_BHS_LOGIN_GET_CBIT(reqh->flags), + &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + return 0; +} + +/* + * This function is used to set the csg bit case in rsp + * return: + * 0, success + * otherwise: error + */ +static int +spdk_iscsi_op_login_rsp_handle_csg_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, + struct iscsi_param *params, int alloc_len) +{ + const char *auth_method; + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_CSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + auth_method = spdk_iscsi_param_get_val(conn->params, "AuthMethod"); + if (auth_method == NULL) { + SPDK_ERRLOG("AuthMethod is empty\n"); + /* Missing parameter */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_MISSING_PARMS; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + if (strcasecmp(auth_method, "None") == 0) { + conn->authenticated = 1; + } else { + rc = spdk_iscsi_auth_params(conn, params, auth_method, + rsp_pdu->data, alloc_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + SPDK_ERRLOG("iscsi_auth_params() failed\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + rsp_pdu->data_segment_len = rc; + if (conn->authenticated == 0) { + /* not complete */ + rsph->flags &= ~ISCSI_LOGIN_TRANSIT; + } else { + if (conn->auth.chap_phase != ISCSI_CHAP_PHASE_END) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CHAP phase not complete"); + } + } + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "Negotiated Auth Params", + rsp_pdu->data, rsp_pdu->data_segment_len); + } + break; + + case ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + if (conn->state == ISCSI_CONN_STATE_INVALID) { + if (conn->req_auth) { + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else { + /* AuthMethod=None */ + conn->authenticated = 1; + } + } + if (conn->authenticated == 0) { + SPDK_ERRLOG("authentication error\n"); + /* Authentication failure */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_AUTHENT_FAIL; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + SPDK_ERRLOG("XXX Login in FullFeaturePhase\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* This function is used to notify the session info + * return + * 0: success + * otherwise: error + */ +static int +spdk_iscsi_op_login_notify_session_info(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + struct spdk_iscsi_portal *portal = conn->portal; + struct iscsi_bhs_login_rsp *rsph; + + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + if 
(conn->sess->session_type == SESSION_TYPE_NORMAL) { + /* normal session */ + SPDK_NOTICELOG("Login from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + portal->host, portal->port, portal->group->tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (spdk_iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (spdk_iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + /* discovery session */ + SPDK_NOTICELOG("Login(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + portal->host, portal->port, portal->group->tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (spdk_iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (spdk_iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else { + SPDK_ERRLOG("unknown session type\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is to handle the tbit cases + * return + * 0: success + * otherwise error + */ +static int +spdk_iscsi_op_login_rsp_handle_t_bit(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + switch (ISCSI_BHS_LOGIN_GET_NSG(rsph->flags)) { + case ISCSI_SECURITY_NEGOTIATION_PHASE: + /* SecurityNegotiation */ + conn->login_phase = ISCSI_SECURITY_NEGOTIATION_PHASE; + break; + + case ISCSI_OPERATIONAL_NEGOTIATION_PHASE: + /* LoginOperationalNegotiation */ + conn->login_phase = ISCSI_OPERATIONAL_NEGOTIATION_PHASE; + break; + + case ISCSI_FULL_FEATURE_PHASE: + /* FullFeaturePhase */ + conn->login_phase = ISCSI_FULL_FEATURE_PHASE; + to_be16(&rsph->tsih, conn->sess->tsih); + + rc = spdk_iscsi_op_login_notify_session_info(conn, rsp_pdu); + if (rc < 0) { + return rc; + } + + conn->full_feature = 1; + spdk_iscsi_conn_migration(conn); + break; + + default: + SPDK_ERRLOG("unknown stage\n"); + /* Initiator error */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + return 0; +} + +/* + * This function is used to set the values of the internal data structure used + * by spdk_iscsi_op_login function + * return: + * 0, used to notify the a successful login + * SPDK_ISCSI_LOGIN_ERROR_RESPONSE, used to notify a failure login. + */ +static int +spdk_iscsi_op_login_rsp_handle(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *rsp_pdu, struct iscsi_param **params, + int alloc_len) +{ + int rc; + struct iscsi_bhs_login_rsp *rsph; + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + + /* negotiate parameters */ + rc = spdk_iscsi_negotiate_params(conn, params, rsp_pdu->data, alloc_len, + rsp_pdu->data_segment_len); + if (rc < 0) { + /* + * spdk_iscsi_negotiate_params just returns -1 on failure, + * so translate this into meaningful response codes and + * return values. 
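+ * The initiator only sees the status class/detail set here; the
+ * SPDK_ISCSI_LOGIN_ERROR_RESPONSE return value simply tells the caller
+ * to send the prepared login response and stop processing the login.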
+ */ + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INITIATOR_ERROR; + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } + + rsp_pdu->data_segment_len = rc; + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "Negotiated Params", rsp_pdu->data, rc); + + /* handle the CSG bit case */ + rc = spdk_iscsi_op_login_rsp_handle_csg_bit(conn, rsp_pdu, *params, + alloc_len); + if (rc < 0) { + return rc; + } + + /* handle the T bit case */ + if (ISCSI_BHS_LOGIN_GET_TBIT(rsph->flags)) { + rc = spdk_iscsi_op_login_rsp_handle_t_bit(conn, rsp_pdu); + } + + return rc; +} + +static int +spdk_iscsi_op_login(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int rc; + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_param *params = NULL; + struct iscsi_param **params_p = ¶ms; + int alloc_len; + int cid; + + if (conn->full_feature && conn->sess != NULL && + conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rc = spdk_iscsi_op_login_rsp_init(conn, pdu, rsp_pdu, params_p, + &alloc_len, &cid); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE || rc == SPDK_ISCSI_LOGIN_ERROR_PARAMETER) { + spdk_iscsi_op_login_response(conn, rsp_pdu, *params_p); + return rc; + } + + /* For other values, we need to directly return */ + if (rc < 0) { + spdk_put_pdu(rsp_pdu); + return rc; + } + + if (conn->state == ISCSI_CONN_STATE_INVALID) { + rc = spdk_iscsi_op_login_phase_none(conn, rsp_pdu, *params_p, + alloc_len, cid); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE || rc == SPDK_ISCSI_LOGIN_ERROR_PARAMETER) { + spdk_iscsi_op_login_response(conn, rsp_pdu, *params_p); + return rc; + } + } + + rc = spdk_iscsi_op_login_rsp_handle(conn, rsp_pdu, params_p, alloc_len); + if (rc == SPDK_ISCSI_LOGIN_ERROR_RESPONSE) { + spdk_iscsi_op_login_response(conn, rsp_pdu, *params_p); + return rc; + } + + rc = spdk_iscsi_op_login_response(conn, rsp_pdu, *params_p); + if (rc == 0) { + conn->state = ISCSI_CONN_STATE_RUNNING; + } else { + SPDK_ERRLOG("login error - connection will be destroyed\n"); + } + + return rc; +} + +static int +spdk_iscsi_op_text(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_param *params = NULL; + struct iscsi_param **params_p = ¶ms; + struct spdk_iscsi_pdu *rsp_pdu; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + uint32_t CmdSN; + uint32_t ExpStatSN; + const char *val; + int F_bit, C_bit; + int data_len; + int alloc_len; + int rc; + struct iscsi_bhs_text_req *reqh; + struct iscsi_bhs_text_resp *rsph; + + data_len = 0; + alloc_len = conn->MaxRecvDataSegmentLength; + + reqh = (struct iscsi_bhs_text_req *)&pdu->bhs; + + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + C_bit = !!(reqh->flags & ISCSI_TEXT_CONTINUE); + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + CmdSN = from_be32(&reqh->cmd_sn); + pdu->cmd_sn = CmdSN; + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, F=%d, C=%d, ITT=%x, TTT=%x\n", + reqh->immediate, F_bit, C_bit, task_tag, from_be32(&reqh->ttt)); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + CmdSN, ExpStatSN, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (ExpStatSN != conn->StatSN) { +#if 0 + SPDK_ERRLOG("StatSN(%u) error\n", ExpStatSN); + return -1; +#else + /* StarPort have a bug */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) rewound\n", ExpStatSN); + conn->StatSN = ExpStatSN; +#endif 
+ } + + if (F_bit && C_bit) { + SPDK_ERRLOG("final and continue\n"); + return -1; + } + + /* + * If this is the first text op in a sequence, save the ITT so we can + * compare it against the ITT for subsequent ops in the same sequence. + * If a subsequent text op in same sequence has a different ITT, reject + * that PDU. + */ + if (conn->sess->current_text_itt == 0xffffffffU) { + conn->sess->current_text_itt = task_tag; + } else if (conn->sess->current_text_itt != task_tag) { + SPDK_ERRLOG("The correct itt is %u, and the current itt is %u...\n", + conn->sess->current_text_itt, task_tag); + return spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + /* store incoming parameters */ + rc = spdk_iscsi_parse_params(¶ms, pdu->data, pdu->data_segment_len, + C_bit, &conn->partial_text_parameter); + if (rc < 0) { + SPDK_ERRLOG("iscsi_parse_params() failed\n"); + spdk_iscsi_param_free(params); + return -1; + } + + data = calloc(1, alloc_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for data segment\n"); + spdk_iscsi_param_free(params); + return -ENOMEM; + } + + /* negotiate parameters */ + data_len = spdk_iscsi_negotiate_params(conn, params_p, + data, alloc_len, data_len); + if (data_len < 0) { + SPDK_ERRLOG("spdk_iscsi_negotiate_params() failed\n"); + spdk_iscsi_param_free(*params_p); + free(data); + return -1; + } + + /* sendtargets is special case */ + val = spdk_iscsi_param_get_val(*params_p, "SendTargets"); + if (val != NULL) { + if (spdk_iscsi_param_eq_val(conn->sess->params, + "SessionType", "Discovery")) { + if (strcasecmp(val, "") == 0) { + val = "ALL"; + } + + data_len = spdk_iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } else { + if (strcasecmp(val, "") == 0) { + val = conn->target->name; + } + + if (strcasecmp(val, "ALL") == 0) { + /* not in discovery session */ + data_len = spdk_iscsi_append_text(conn, + "SendTargets", + "Reject", data, + alloc_len, + data_len); + } else { + data_len = spdk_iscsi_send_tgts(conn, + conn->initiator_name, + conn->initiator_addr, + val, data, alloc_len, + data_len); + } + } + } else { + if (spdk_iscsi_param_eq_val(conn->sess->params, "SessionType", "Discovery")) { + spdk_iscsi_param_free(*params_p); + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "Negotiated Params", data, data_len); + + /* response PDU */ + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + spdk_iscsi_param_free(*params_p); + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_text_resp *)&rsp_pdu->bhs; + + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_TEXT_RSP; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + if (C_bit) { + rsph->flags |= ISCSI_TEXT_CONTINUE; + } + + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + + if (F_bit) { + rsph->ttt = 0xffffffffU; + conn->sess->current_text_itt = 0xffffffffU; + } else { + to_be32(&rsph->ttt, 1 + conn->id); + } + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + /* update internal variables */ + rc = spdk_iscsi_copy_param2var(conn); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_copy_param2var() failed\n"); + spdk_iscsi_param_free(*params_p); + return -1; + } + + /* check value */ + rc = 
spdk_iscsi_check_values(conn); + if (rc < 0) { + SPDK_ERRLOG("iscsi_check_values() failed\n"); + spdk_iscsi_param_free(*params_p); + return -1; + } + + spdk_iscsi_param_free(*params_p); + return 0; +} + +static int +spdk_iscsi_op_logout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_pdu *rsp_pdu; + uint32_t task_tag; + uint32_t CmdSN; + uint32_t ExpStatSN; + int response; + struct iscsi_bhs_logout_req *reqh; + struct iscsi_bhs_logout_resp *rsph; + uint16_t cid; + + reqh = (struct iscsi_bhs_logout_req *)&pdu->bhs; + + cid = from_be16(&reqh->cid); + task_tag = from_be32(&reqh->itt); + CmdSN = from_be32(&reqh->cmd_sn); + pdu->cmd_sn = CmdSN; + ExpStatSN = from_be32(&reqh->exp_stat_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "reason=%d, ITT=%x, cid=%d\n", + reqh->reason, task_tag, cid); + + if (reqh->reason != 0 && conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("only logout with close the session reason can be in discovery session"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + if (conn->sess != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "CmdSN=%u, ExpStatSN=%u, StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + CmdSN, ExpStatSN, conn->StatSN, + conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + if (CmdSN != conn->sess->ExpCmdSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN(%u) might have dropped\n", CmdSN); + /* ignore error */ + } + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, ExpStatSN=%u, StatSN=%u\n", + CmdSN, ExpStatSN, conn->StatSN); + } + + if (ExpStatSN != conn->StatSN) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u/%u) might have dropped\n", + ExpStatSN, conn->StatSN); + /* ignore error */ + } + + if (conn->id == cid) { + response = 0; // connection or session closed successfully + spdk_iscsi_conn_logout(conn); + } else { + response = 1; + } + + /* response PDU */ + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_logout_resp *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_LOGOUT_RSP; + rsph->flags |= 0x80; /* bit 0 must be 1 */ + rsph->response = response; + DSET24(rsph->data_segment_len, 0); + to_be32(&rsph->itt, task_tag); + + if (conn->sess != NULL) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (conn->sess->connections == 1) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + } else { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + to_be32(&rsph->exp_cmd_sn, CmdSN); + to_be32(&rsph->max_cmd_sn, CmdSN); + } + + rsph->time_2_wait = 0; + rsph->time_2_retain = 0; + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + if (conn->sess == NULL) { + /* + * login failed but initiator still sent a logout rather than + * just closing the TCP connection. 
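+ * conn->sess is NULL in that case, so the log message below omits the
+ * session fields (ISID, TSIH, CID) that the other branches report.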
+ */ + snprintf(buf, sizeof buf, "Logout(login failed) from %s (%s) on" + " (%s:%s,%d)\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag); + } else if (spdk_iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + snprintf(buf, sizeof buf, "Logout from %s (%s) on %s tgt_node%d" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->target->name, conn->target->num, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (spdk_iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (spdk_iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } else { + /* discovery session */ + snprintf(buf, sizeof buf, "Logout(discovery) from %s (%s) on" + " (%s:%s,%d), ISID=%"PRIx64", TSIH=%u," + " CID=%u, HeaderDigest=%s, DataDigest=%s\n", + conn->initiator_name, conn->initiator_addr, + conn->portal_host, conn->portal_port, conn->pg_tag, + conn->sess->isid, conn->sess->tsih, conn->cid, + (spdk_iscsi_param_eq_val(conn->params, "HeaderDigest", "CRC32C") + ? "on" : "off"), + (spdk_iscsi_param_eq_val(conn->params, "DataDigest", "CRC32C") + ? "on" : "off")); + } + + SPDK_NOTICELOG("%s", buf); + + return 0; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * task transfertag and the pdu's opcode + */ +static struct spdk_iscsi_task * +spdk_get_scsi_task_from_ttt(struct spdk_iscsi_conn *conn, + uint32_t transfer_tag) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_data_in *datain_bhs; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_bhs = (struct iscsi_bhs_data_in *)&pdu->bhs; + if (from_be32(&datain_bhs->ttt) == transfer_tag) { + return pdu->task; + } + } + } + + return NULL; +} + +/* This function returns the spdk_scsi_task by searching the snack list via + * initiator task tag and the pdu's opcode + */ +static struct spdk_iscsi_task * +spdk_get_scsi_task_from_itt(struct spdk_iscsi_conn *conn, + uint32_t task_tag, enum iscsi_op opcode) +{ + struct spdk_iscsi_pdu *pdu; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == opcode && + pdu->task != NULL && + pdu->task->tag == task_tag) { + return pdu->task; + } + } + + return NULL; +} + +static int +spdk_iscsi_send_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int datain_flag, + int residual_len, int offset, int DataSN, int len) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_data_in *rsph; + uint32_t task_tag; + uint32_t transfer_tag; + int F_bit, U_bit, O_bit, S_bit; + struct spdk_iscsi_task *primary; + + primary = spdk_iscsi_task_get_primary(task); + + /* DATA PDU */ + rsp_pdu = spdk_get_pdu(); + rsph = (struct iscsi_bhs_data_in *)&rsp_pdu->bhs; + rsp_pdu->data = task->scsi.iovs[0].iov_base + offset; + rsp_pdu->data_from_mempool = true; + + task_tag = task->tag; + transfer_tag = 0xffffffffU; + + F_bit = datain_flag & ISCSI_FLAG_FINAL; + O_bit = datain_flag & ISCSI_DATAIN_OVERFLOW; + U_bit = datain_flag & ISCSI_DATAIN_UNDERFLOW; + S_bit = datain_flag & ISCSI_DATAIN_STATUS; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_DATAIN; + + if (F_bit) { + rsph->flags |= ISCSI_FLAG_FINAL; + } + + /* we leave the A_bit clear */ + + if (F_bit && 
S_bit) { + if (O_bit) { + rsph->flags |= ISCSI_DATAIN_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_DATAIN_UNDERFLOW; + } + } + + if (S_bit) { + rsph->flags |= ISCSI_DATAIN_STATUS; + rsph->status = task->scsi.status; + } + + DSET24(rsph->data_segment_len, len); + + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, transfer_tag); + + if (S_bit) { + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + } + + if (F_bit && S_bit && !spdk_iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->data_sn, DataSN); + + if (conn->sess->ErrorRecoveryLevel >= 1) { + primary->datain_datasn = DataSN; + } + DataSN++; + + if (task->parent) { + offset += primary->scsi.data_transferred; + } + to_be32(&rsph->buffer_offset, (uint32_t)offset); + + if (F_bit && S_bit) { + to_be32(&rsph->res_cnt, residual_len); + } + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + return DataSN; +} + +static int +spdk_iscsi_transfer_in(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + uint32_t DataSN; + int transfer_len; + int data_len; + int segment_len; + int offset; + int residual_len = 0; + int sent_status; + int len; + int datain_flag = 0; + int datain_seq_cnt; + int i; + int sequence_end; + struct spdk_iscsi_task *primary; + + primary = spdk_iscsi_task_get_primary(task); + segment_len = conn->MaxRecvDataSegmentLength; + data_len = task->scsi.data_transferred; + transfer_len = task->scsi.length; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + if (task != primary) { + conn->data_in_cnt--; + /* Handle the case when primary task return success but the subtask failed */ + if (primary->bytes_completed == primary->scsi.transfer_len && + primary->scsi.status == SPDK_SCSI_STATUS_GOOD) { + conn->data_in_cnt--; + } + } else { + /* handle the case that it is a primary task which has subtasks */ + if (primary->scsi.transfer_len != primary->scsi.length) { + conn->data_in_cnt--; + } + } + + return 0; + } + + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %u/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + transfer_len = data_len; + datain_flag |= ISCSI_DATAIN_UNDERFLOW; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %u/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + datain_flag |= ISCSI_DATAIN_OVERFLOW; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + residual_len = 0; + } + + DataSN = primary->datain_datasn; + sent_status = 0; + + /* calculate the number of sequences for all data-in pdus */ + datain_seq_cnt = 1 + ((transfer_len - 1) / (int)conn->sess->MaxBurstLength); + for (i = 0; i < datain_seq_cnt; i++) { + offset = i * conn->sess->MaxBurstLength; + sequence_end = DMIN32(((i + 1) * conn->sess->MaxBurstLength), + transfer_len); + + /* send data splitted by segment_len */ + for (; offset < sequence_end; offset += segment_len) { + len = DMIN32(segment_len, (sequence_end - offset)); + + datain_flag &= ~ISCSI_FLAG_FINAL; + datain_flag &= ~ISCSI_DATAIN_STATUS; + + if (offset + len == sequence_end) { + /* last PDU in a sequence */ + datain_flag |= ISCSI_FLAG_FINAL; + if (task->scsi.sense_data_len == 0) { + /* The last pdu in all data-in pdus */ + if ((offset + len) == transfer_len && + (primary->bytes_completed == primary->scsi.transfer_len)) { + datain_flag |= ISCSI_DATAIN_STATUS; + 
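+ /* The SCSI status is piggybacked on this final
+  * Data-In PDU, so no separate SCSI Response PDU
+  * is sent for the command. */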
sent_status = 1; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer=%d, Offset=%d, Len=%d\n", + sequence_end, offset, len); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, DataSN=%u, Offset=%u, Len=%d\n", + conn->StatSN, DataSN, offset, len); + + DataSN = spdk_iscsi_send_datain(conn, task, datain_flag, residual_len, + offset, DataSN, len); + } + } + + if (task != primary) { + primary->scsi.data_transferred += task->scsi.data_transferred; + } + primary->datain_datasn = DataSN; + + return sent_status; +} + +/* + * This function compare the input pdu's bhs with the pdu's bhs associated by + * active_r2t_tasks and queued_r2t_tasks in a connection + */ +static bool +spdk_iscsi_compare_pdu_bhs_within_existed_r2t_tasks(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + + TAILQ_FOREACH(task, &conn->active_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, spdk_iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + TAILQ_FOREACH(task, &conn->queued_r2t_tasks, link) { + if (!memcmp(&pdu->bhs, spdk_iscsi_task_get_bhs(task), ISCSI_BHS_LEN)) { + return true; + } + } + + return false; +} + +static void spdk_iscsi_queue_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + spdk_trace_record(TRACE_ISCSI_TASK_QUEUE, conn->id, task->scsi.length, + (uintptr_t)task, (uintptr_t)task->pdu); + task->is_queued = true; + spdk_scsi_dev_queue_task(conn->dev, &task->scsi); +} + +static void spdk_iscsi_queue_mgmt_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + enum spdk_scsi_task_func func) +{ + spdk_scsi_dev_queue_mgmt_task(conn->dev, &task->scsi, func); +} + +int spdk_iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task; + + while (!TAILQ_EMPTY(&conn->queued_datain_tasks) && + conn->data_in_cnt < MAX_LARGE_DATAIN_PER_CONNECTION) { + task = TAILQ_FIRST(&conn->queued_datain_tasks); + assert(task->current_datain_offset <= task->scsi.transfer_len); + + if (task->current_datain_offset == 0) { + task->scsi.lun = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (task->scsi.lun == NULL) { + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + spdk_scsi_task_process_null_lun(&task->scsi); + spdk_iscsi_task_cpl(&task->scsi); + return 0; + } + task->current_datain_offset = task->scsi.length; + conn->data_in_cnt++; + spdk_iscsi_queue_task(conn, task); + continue; + } + if (task->current_datain_offset < task->scsi.transfer_len) { + struct spdk_iscsi_task *subtask; + uint32_t remaining_size = 0; + + remaining_size = task->scsi.transfer_len - task->current_datain_offset; + subtask = spdk_iscsi_task_get(conn, task, spdk_iscsi_task_cpl); + assert(subtask != NULL); + subtask->scsi.offset = task->current_datain_offset; + subtask->scsi.length = DMIN32(SPDK_BDEV_LARGE_BUF_MAX_SIZE, remaining_size); + spdk_scsi_task_set_data(&subtask->scsi, NULL, 0); + task->current_datain_offset += subtask->scsi.length; + conn->data_in_cnt++; + + task->scsi.lun = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + if (task->scsi.lun == NULL) { + /* Remove the primary task from the list if this is the last subtask */ + if (task->current_datain_offset == task->scsi.transfer_len) { + TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + } + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + spdk_iscsi_task_cpl(&subtask->scsi); + return 0; + } + + spdk_iscsi_queue_task(conn, subtask); + } + if (task->current_datain_offset == task->scsi.transfer_len) { + 
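+ /* All subtasks for this large read have been issued, so the
+  * primary task can be removed from the pending queue. */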
TAILQ_REMOVE(&conn->queued_datain_tasks, task, link); + } + } + return 0; +} + +static int spdk_iscsi_op_scsi_read(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + int32_t remaining_size; + + TAILQ_INIT(&task->subtask_list); + task->scsi.dxfer_dir = SPDK_SCSI_DIR_FROM_DEV; + task->parent = NULL; + task->scsi.offset = 0; + task->scsi.length = DMIN32(SPDK_BDEV_LARGE_BUF_MAX_SIZE, task->scsi.transfer_len); + spdk_scsi_task_set_data(&task->scsi, NULL, 0); + + remaining_size = task->scsi.transfer_len - task->scsi.length; + task->current_datain_offset = 0; + + if (remaining_size == 0) { + spdk_iscsi_queue_task(conn, task); + return 0; + } + + TAILQ_INSERT_TAIL(&conn->queued_datain_tasks, task, link); + + return spdk_iscsi_conn_handle_queued_datain_tasks(conn); +} + +static int +spdk_iscsi_op_scsi(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + uint8_t *cdb; + uint64_t lun; + uint32_t task_tag; + uint32_t transfer_len; + int F_bit, R_bit, W_bit; + int lun_i, rc; + struct iscsi_bhs_scsi_req *reqh; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_SCSI not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + + F_bit = reqh->final_bit; + R_bit = reqh->read_bit; + W_bit = reqh->write_bit; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + transfer_len = from_be32(&reqh->expected_data_xfer_len); + cdb = reqh->cdb; + + SPDK_TRACEDUMP(SPDK_LOG_ISCSI, "CDB", cdb, 16); + + task = spdk_iscsi_task_get(conn, NULL, spdk_iscsi_task_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + spdk_iscsi_task_associate_pdu(task, pdu); + lun_i = spdk_islun2lun(lun); + task->lun_id = lun_i; + dev = conn->dev; + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + + if ((R_bit != 0) && (W_bit != 0)) { + SPDK_ERRLOG("Bidirectional CDB is not supported\n"); + spdk_iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + task->scsi.cdb = cdb; + task->tag = task_tag; + task->scsi.transfer_len = transfer_len; + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->parent = NULL; + + if (task->scsi.lun == NULL) { + spdk_scsi_task_process_null_lun(&task->scsi); + spdk_iscsi_task_cpl(&task->scsi); + return 0; + } + + /* no bi-directional support */ + if (R_bit) { + return spdk_iscsi_op_scsi_read(conn, task); + } else if (W_bit) { + task->scsi.dxfer_dir = SPDK_SCSI_DIR_TO_DEV; + + if ((conn->sess->ErrorRecoveryLevel >= 1) && + (spdk_iscsi_compare_pdu_bhs_within_existed_r2t_tasks(conn, pdu))) { + spdk_iscsi_task_response(conn, task); + spdk_iscsi_task_put(task); + return 0; + } + + if (pdu->data_segment_len > transfer_len) { + SPDK_ERRLOG("data segment len(=%d) > task transfer len(=%d)\n", + (int)pdu->data_segment_len, transfer_len); + spdk_iscsi_task_put(task); + rc = spdk_iscsi_reject(conn, pdu, + ISCSI_REASON_PROTOCOL_ERROR); + if (rc < 0) { + SPDK_ERRLOG("iscsi_reject() failed\n"); + } + return rc; + } + + /* check the ImmediateData and also pdu->data_segment_len */ + if ((!conn->sess->ImmediateData && (pdu->data_segment_len > 0)) || + (pdu->data_segment_len > conn->sess->FirstBurstLength)) { + spdk_iscsi_task_put(task); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + if (rc < 0) { + SPDK_ERRLOG("iscsi_reject() failed\n"); + } + return rc; + } + + if (F_bit && 
pdu->data_segment_len < transfer_len) { + /* needs R2T */ + rc = spdk_add_transfer_task(conn, task); + if (rc < 0) { + SPDK_ERRLOG("add_transfer_task() failed\n"); + spdk_iscsi_task_put(task); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* Non-immediate writes */ + if (pdu->data_segment_len == 0) { + return 0; + } else { + /* we are doing the first partial write task */ + task->scsi.ref++; + spdk_scsi_task_set_data(&task->scsi, pdu->data, pdu->data_segment_len); + task->scsi.length = pdu->data_segment_len; + } + } + + if (pdu->data_segment_len == transfer_len) { + /* we are doing small writes with no R2T */ + spdk_scsi_task_set_data(&task->scsi, pdu->data, transfer_len); + task->scsi.length = transfer_len; + } + } else { + /* neither R nor W bit set */ + task->scsi.dxfer_dir = SPDK_SCSI_DIR_NONE; + if (transfer_len > 0) { + spdk_iscsi_task_put(task); + SPDK_ERRLOG("Reject scsi cmd with EDTL > 0 but (R | W) == 0\n"); + return spdk_iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + } + + spdk_iscsi_queue_task(conn, task); + return 0; +} + +void +spdk_iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_task_req *reqh; + struct iscsi_bhs_task_resp *rsph; + + if (task->pdu == NULL) { + /* + * This was an internally generated task management command, + * usually from LUN cleanup when a connection closes. + */ + return; + } + + reqh = (struct iscsi_bhs_task_req *)&task->pdu->bhs; + /* response PDU */ + rsp_pdu = spdk_get_pdu(); + rsph = (struct iscsi_bhs_task_resp *)&rsp_pdu->bhs; + rsph->opcode = ISCSI_OP_TASK_RSP; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + switch (task->scsi.response) { + case SPDK_SCSI_TASK_MGMT_RESP_COMPLETE: + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_SUCCESS: + rsph->response = ISCSI_TASK_FUNC_RESP_COMPLETE; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN: + rsph->response = ISCSI_TASK_FUNC_RESP_LUN_NOT_EXIST; + break; + case SPDK_SCSI_TASK_MGMT_RESP_TARGET_FAILURE: + rsph->response = ISCSI_TASK_FUNC_REJECTED; + break; + case SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED: + rsph->response = ISCSI_TASK_FUNC_RESP_FUNC_NOT_SUPPORTED; + break; + } + rsph->itt = reqh->itt; + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (reqh->immediate == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); +} + +void spdk_iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_scsi_resp *rsph; + uint32_t task_tag; + uint32_t transfer_len; + size_t residual_len; + size_t data_len; + int O_bit, U_bit; + int rc; + struct spdk_iscsi_task *primary; + + primary = spdk_iscsi_task_get_primary(task); + + transfer_len = primary->scsi.transfer_len; + task_tag = task->tag; + + /* transfer data from logical unit */ + /* (direction is view of initiator side) */ + if (spdk_iscsi_task_is_read(primary)) { + rc = spdk_iscsi_transfer_in(conn, task); + if (rc > 0) { + /* sent status by last DATAIN PDU */ + return; + } + + if (primary->bytes_completed != primary->scsi.transfer_len) { + return; + } + } + + O_bit = U_bit = 0; + residual_len = 0; + data_len = primary->scsi.data_transferred; + + if ((transfer_len != 0) && + 
(task->scsi.status == SPDK_SCSI_STATUS_GOOD)) { + if (data_len < transfer_len) { + /* underflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Underflow %zu/%u\n", data_len, transfer_len); + residual_len = transfer_len - data_len; + U_bit = 1; + } else if (data_len > transfer_len) { + /* overflow */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Overflow %zu/%u\n", data_len, transfer_len); + residual_len = data_len - transfer_len; + O_bit = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Transfer %u\n", transfer_len); + } + } + + /* response PDU */ + rsp_pdu = spdk_get_pdu(); + assert(rsp_pdu != NULL); + rsph = (struct iscsi_bhs_scsi_resp *)&rsp_pdu->bhs; + assert(task->scsi.sense_data_len <= sizeof(rsp_pdu->sense.data)); + memcpy(rsp_pdu->sense.data, task->scsi.sense_data, task->scsi.sense_data_len); + to_be16(&rsp_pdu->sense.length, task->scsi.sense_data_len); + rsp_pdu->data = (uint8_t *)&rsp_pdu->sense; + rsp_pdu->data_from_mempool = true; + + /* + * we need to hold onto this task/cmd because until the + * PDU has been written out + */ + rsp_pdu->task = task; + task->scsi.ref++; + + rsph->opcode = ISCSI_OP_SCSI_RSP; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + + if (O_bit) { + rsph->flags |= ISCSI_SCSI_OVERFLOW; + } + + if (U_bit) { + rsph->flags |= ISCSI_SCSI_UNDERFLOW; + } + + rsph->status = task->scsi.status; + if (task->scsi.sense_data_len) { + /* SenseLength (2 bytes) + SenseData */ + DSET24(rsph->data_segment_len, 2 + task->scsi.sense_data_len); + } + to_be32(&rsph->itt, task_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (!spdk_iscsi_task_is_immediate(primary)) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->bi_read_res_cnt, 0); + to_be32(&rsph->res_cnt, residual_len); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); +} + +static struct spdk_iscsi_task * +spdk_get_transfer_task(struct spdk_iscsi_conn *conn, uint32_t transfer_tag) +{ + int i; + + for (i = 0; i < conn->pending_r2t; i++) { + if (conn->outstanding_r2t_tasks[i]->ttt == transfer_tag) { + return (conn->outstanding_r2t_tasks[i]); + } + } + + return NULL; +} + +static int +spdk_iscsi_op_task(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_task_req *reqh; + uint64_t lun; + uint32_t task_tag; + uint32_t ref_task_tag; + uint8_t function; + int lun_i; + struct spdk_iscsi_task *task; + struct spdk_scsi_dev *dev; + + if (conn->sess->session_type != SESSION_TYPE_NORMAL) { + SPDK_ERRLOG("ISCSI_OP_TASK not allowed in discovery and invalid session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_task_req *)&pdu->bhs; + function = reqh->flags & ISCSI_TASK_FUNCTION_MASK; + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + ref_task_tag = from_be32(&reqh->ref_task_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, func=%d, ITT=%x, ref TT=%x, LUN=0x%16.16"PRIx64"\n", + reqh->immediate, function, task_tag, ref_task_tag, lun); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, conn->sess->MaxCmdSN); + + lun_i = spdk_islun2lun(lun); + dev = conn->dev; + + task = spdk_iscsi_task_get(conn, NULL, spdk_iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + spdk_iscsi_task_associate_pdu(task, pdu); + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->tag = task_tag; + 
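+ /* The switch below queues only ABORT TASK, ABORT TASK SET and
+  * LOGICAL UNIT RESET to the SCSI layer; the remaining functions are
+  * answered immediately as unsupported or rejected. */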
task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_i); + + switch (function) { + /* abort task identified by Referenced Task Tag field */ + case ISCSI_TASK_FUNC_ABORT_TASK: + SPDK_NOTICELOG("ABORT_TASK\n"); + + task->scsi.abort_id = ref_task_tag; + + spdk_iscsi_queue_mgmt_task(conn, task, SPDK_SCSI_TASK_FUNC_ABORT_TASK); + spdk_del_transfer_task(conn, ref_task_tag); + + return SPDK_SUCCESS; + + /* abort all tasks issued via this session on the LUN */ + case ISCSI_TASK_FUNC_ABORT_TASK_SET: + SPDK_NOTICELOG("ABORT_TASK_SET\n"); + + spdk_iscsi_queue_mgmt_task(conn, task, SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET); + spdk_clear_all_transfer_task(conn, task->scsi.lun); + + return SPDK_SUCCESS; + + case ISCSI_TASK_FUNC_CLEAR_TASK_SET: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_TASK_SET (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_CLEAR_ACA: + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_NOTICELOG("CLEAR_ACA (Unsupported)\n"); + break; + + case ISCSI_TASK_FUNC_LOGICAL_UNIT_RESET: + SPDK_NOTICELOG("LOGICAL_UNIT_RESET\n"); + + spdk_iscsi_queue_mgmt_task(conn, task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + spdk_clear_all_transfer_task(conn, task->scsi.lun); + return SPDK_SUCCESS; + + case ISCSI_TASK_FUNC_TARGET_WARM_RESET: + SPDK_NOTICELOG("TARGET_WARM_RESET (Unsupported)\n"); + +#if 0 + spdk_iscsi_drop_conns(conn, conn->initiator_name, 1 /* drop all */); + rc = spdk_iscsi_tgt_node_reset(conn->sess->target, lun); + if (rc < 0) { + SPDK_ERRLOG("tgt_node reset failed\n"); + } +#else + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; +#endif + break; + + case ISCSI_TASK_FUNC_TARGET_COLD_RESET: + SPDK_NOTICELOG("TARGET_COLD_RESET\n"); + +#if 0 + spdk_iscsi_drop_conns(conn, conn->initiator_name, 1 /* drop all */); + + rc = spdk_iscsi_tgt_node_reset(conn->sess->target, lun); + if (rc < 0) { + SPDK_ERRLOG("tgt_node reset failed\n"); + } + + conn->state = ISCSI_CONN_STATE_EXITING; +#else + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; +#endif + break; + + case ISCSI_TASK_FUNC_TASK_REASSIGN: + SPDK_NOTICELOG("TASK_REASSIGN (Unsupported)\n"); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + + default: + SPDK_ERRLOG("unsupported function %d\n", function); + task->scsi.response = SPDK_SCSI_TASK_MGMT_RESP_REJECT; + break; + } + + spdk_iscsi_task_mgmt_response(conn, task); + spdk_iscsi_task_put(task); + return 0; +} + +static int +spdk_iscsi_op_nopout(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_out *reqh; + struct iscsi_bhs_nop_in *rsph; + uint8_t *data; + uint64_t lun; + uint32_t task_tag; + uint32_t transfer_tag; + uint32_t CmdSN; + int I_bit; + int data_len; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_NOPOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_nop_out *)&pdu->bhs; + I_bit = reqh->immediate; + + data_len = DGET24(reqh->data_segment_len); + if (data_len > conn->MaxRecvDataSegmentLength) { + data_len = conn->MaxRecvDataSegmentLength; + } + + lun = from_be64(&reqh->lun); + task_tag = from_be32(&reqh->itt); + transfer_tag = from_be32(&reqh->ttt); + CmdSN = from_be32(&reqh->cmd_sn); + pdu->cmd_sn = CmdSN; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "I=%d, ITT=%x, TTT=%x\n", + I_bit, task_tag, transfer_tag); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "CmdSN=%u, StatSN=%u, 
ExpCmdSN=%u, MaxCmdSN=%u\n", + CmdSN, conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + if (transfer_tag != 0xFFFFFFFF && transfer_tag != (uint32_t)conn->id) { + SPDK_ERRLOG("invalid transfer tag 0x%x\n", transfer_tag); + /* + * Technically we should probably fail the connection here, but for now + * just print the error message and continue. + */ + } + + /* + * We don't actually check to see if this is a response to the NOP-In + * that we sent. Our goal is to just verify that the initiator is + * alive and responding to commands, not to verify that it tags + * NOP-Outs correctly + */ + conn->nop_outstanding = false; + + if (task_tag == 0xffffffffU) { + if (I_bit == 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "got NOPOUT ITT=0xffffffff\n"); + return SPDK_SUCCESS; + } else { + SPDK_ERRLOG("got NOPOUT ITT=0xffffffff, I=0\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + data = calloc(1, data_len); + if (!data) { + SPDK_ERRLOG("calloc() failed for ping data\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + /* response of NOPOUT */ + if (data_len > 0) { + /* copy ping data */ + memcpy(data, pdu->data, data_len); + } + + transfer_tag = 0xffffffffU; + + /* response PDU */ + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + free(data); + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_nop_in *)&rsp_pdu->bhs; + rsp_pdu->data = data; + rsph->opcode = ISCSI_OP_NOPIN; + rsph->flags |= 0x80; /* bit 0 default to 1 */ + DSET24(rsph->data_segment_len, data_len); + to_be64(&rsph->lun, lun); + to_be32(&rsph->itt, task_tag); + to_be32(&rsph->ttt, transfer_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + conn->StatSN++; + + if (I_bit == 0) { + conn->sess->MaxCmdSN++; + } + + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + conn->last_nopin = spdk_get_ticks(); + + return SPDK_SUCCESS; +} + +static int +spdk_add_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task) +{ + uint32_t transfer_len; + size_t max_burst_len; + size_t segment_len; + size_t data_len; + int len; + int idx; + int rc; + int data_out_req; + + transfer_len = task->scsi.transfer_len; + data_len = spdk_iscsi_task_get_pdu(task)->data_segment_len; + max_burst_len = conn->sess->MaxBurstLength; + segment_len = SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; + data_out_req = 1 + (transfer_len - data_len - 1) / segment_len; + task->data_out_cnt = data_out_req; + + /* + * If we already have too many tasks using R2T, then queue this task + * and start sending R2T for it after some of the tasks using R2T/data + * out buffers complete. + */ + if (conn->pending_r2t >= DEFAULT_MAXR2T) { + TAILQ_INSERT_TAIL(&conn->queued_r2t_tasks, task, link); + return SPDK_SUCCESS; + } + + conn->data_out_cnt += data_out_req; + idx = conn->pending_r2t++; + + conn->outstanding_r2t_tasks[idx] = task; + task->next_expected_r2t_offset = data_len; + task->current_r2t_length = 0; + task->R2TSN = 0; + /* According to RFC3720 10.8.5, 0xffffffff is + * reserved for TTT in R2T. 
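+ * The per-connection counter below therefore skips that value,
+ * wrapping from 0xfffffffe back to 0.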
+ */ + if (++conn->ttt == 0xffffffffu) { + conn->ttt = 0; + } + task->ttt = conn->ttt; + + while (data_len != transfer_len) { + len = DMIN32(max_burst_len, (transfer_len - data_len)); + rc = spdk_iscsi_send_r2t(conn, task, data_len, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + return rc; + } + data_len += len; + task->next_r2t_offset = data_len; + task->outstanding_r2t++; + if (conn->sess->MaxOutstandingR2T == task->outstanding_r2t) { + break; + } + } + + TAILQ_INSERT_TAIL(&conn->active_r2t_tasks, task, link); + return SPDK_SUCCESS; +} + +/* If there are additional large writes queued for R2Ts, start them now. + * This is called when a large write is just completed or when multiple LUNs + * are attached and large write tasks for the specific LUN are cleared. + */ +static void +spdk_start_queued_transfer_tasks(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_task *task, *tmp; + + TAILQ_FOREACH_SAFE(task, &conn->queued_r2t_tasks, link, tmp) { + if (conn->pending_r2t < DEFAULT_MAXR2T) { + TAILQ_REMOVE(&conn->queued_r2t_tasks, task, link); + spdk_add_transfer_task(conn, task); + } else { + break; + } + } +} + +void spdk_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t task_tag) +{ + struct spdk_iscsi_task *task; + int i; + + for (i = 0; i < conn->pending_r2t; i++) { + if (conn->outstanding_r2t_tasks[i]->tag == task_tag) { + task = conn->outstanding_r2t_tasks[i]; + conn->data_out_cnt -= task->data_out_cnt; + + conn->pending_r2t--; + for (; i < conn->pending_r2t; i++) { + conn->outstanding_r2t_tasks[i] = conn->outstanding_r2t_tasks[i + 1]; + } + conn->outstanding_r2t_tasks[conn->pending_r2t] = NULL; + break; + } + } + + spdk_start_queued_transfer_tasks(conn); +} + +static void +spdk_del_connection_queued_task(struct spdk_iscsi_conn *conn, void *tailq, + struct spdk_scsi_lun *lun) +{ + struct spdk_iscsi_task *task, *task_tmp; + /* + * Temporary used to index spdk_scsi_task related + * queues of the connection. 
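+ *
+ * Illustrative note: callers pass either &conn->active_r2t_tasks or
+ * &conn->queued_r2t_tasks through the untyped 'tailq' argument, e.g.
+ *
+ *     spdk_del_connection_queued_task(conn, &conn->active_r2t_tasks, lun);
+ *
+ * so a compatible TAILQ_HEAD type is re-declared locally just so the
+ * TAILQ_FOREACH_SAFE()/TAILQ_REMOVE() macros can walk and edit the list.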
+ */ + TAILQ_HEAD(queued_tasks, spdk_iscsi_task) *head; + head = (struct queued_tasks *)tailq; + + TAILQ_FOREACH_SAFE(task, head, link, task_tmp) { + if (lun == NULL || lun == task->scsi.lun) { + TAILQ_REMOVE(head, task, link); + if (lun != NULL && spdk_scsi_lun_is_removing(lun)) { + spdk_scsi_task_process_null_lun(&task->scsi); + spdk_iscsi_task_response(conn, task); + } + spdk_iscsi_task_put(task); + } + } +} + +void spdk_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun) +{ + int i, j, pending_r2t; + struct spdk_iscsi_task *task; + + pending_r2t = conn->pending_r2t; + for (i = 0; i < pending_r2t; i++) { + task = conn->outstanding_r2t_tasks[i]; + if (lun == NULL || lun == task->scsi.lun) { + conn->outstanding_r2t_tasks[i] = NULL; + task->outstanding_r2t = 0; + task->next_r2t_offset = 0; + task->next_expected_r2t_offset = 0; + conn->data_out_cnt -= task->data_out_cnt; + conn->pending_r2t--; + } + } + + for (i = 0; i < pending_r2t; i++) { + if (conn->outstanding_r2t_tasks[i] != NULL) { + continue; + } + for (j = i + 1; j < pending_r2t; j++) { + if (conn->outstanding_r2t_tasks[j] != NULL) { + conn->outstanding_r2t_tasks[i] = conn->outstanding_r2t_tasks[j]; + conn->outstanding_r2t_tasks[j] = NULL; + break; + } + } + } + + spdk_del_connection_queued_task(conn, &conn->active_r2t_tasks, lun); + spdk_del_connection_queued_task(conn, &conn->queued_r2t_tasks, lun); + + spdk_start_queued_transfer_tasks(conn); +} + +/* This function is used to handle the r2t snack */ +static int +spdk_iscsi_handle_r2t_snack(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, int32_t task_tag) +{ + int32_t last_r2tsn; + int i; + + if (beg_run < task->acked_r2tsn) { + SPDK_ERRLOG("ITT: 0x%08x, R2T SNACK requests retransmission of" + "R2TSN: from 0x%08x to 0x%08x. 
But it has already" + "ack to R2TSN:0x%08x, protocol error.\n", + task_tag, beg_run, (beg_run + run_length), + (task->acked_r2tsn - 1)); + return spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length) { + if ((beg_run + run_length) > task->R2TSN) { + SPDK_ERRLOG("ITT: 0x%08x, received R2T SNACK with" + "BegRun: 0x%08x, RunLength: 0x%08x, exceeds" + "current R2TSN: 0x%08x, protocol error.\n", + task_tag, beg_run, run_length, + task->R2TSN); + + return spdk_iscsi_reject(conn, pdu, + ISCSI_REASON_INVALID_PDU_FIELD); + } + last_r2tsn = (beg_run + run_length); + } else { + last_r2tsn = task->R2TSN; + } + + for (i = beg_run; i < last_r2tsn; i++) { + if (spdk_iscsi_send_r2t_recovery(conn, task, i, false) < 0) { + SPDK_ERRLOG("The r2t_sn=%d of r2t_task=%p is not sent\n", i, task); + } + } + return 0; +} + +/* This function is used to recover the data in packet */ +static int +spdk_iscsi_handle_recovery_datain(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + struct spdk_iscsi_pdu *pdu, uint32_t beg_run, + uint32_t run_length, uint32_t task_tag) +{ + struct spdk_iscsi_pdu *old_pdu, *pdu_temp; + uint32_t i; + struct iscsi_bhs_data_in *datain_header; + uint32_t last_statsn; + + task = spdk_iscsi_task_get_primary(task); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_handle_recovery_datain\n"); + + if (beg_run < task->acked_data_sn) { + SPDK_ERRLOG("ITT: 0x%08x, DATA IN SNACK requests retransmission of" + "DATASN: from 0x%08x to 0x%08x but already acked to " + "DATASN: 0x%08x protocol error\n", + task_tag, beg_run, + (beg_run + run_length), (task->acked_data_sn - 1)); + + return spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + } + + if (run_length == 0) { + /* as the DataSN begins at 0 */ + run_length = task->datain_datasn + 1; + } + + if ((beg_run + run_length - 1) > task->datain_datasn) { + SPDK_ERRLOG("Initiator requests BegRun: 0x%08x, RunLength:" + "0x%08x greater than maximum DataSN: 0x%08x.\n", + beg_run, run_length, task->datain_datasn); + + return -1; + } else { + last_statsn = beg_run + run_length - 1; + } + + for (i = beg_run; i <= last_statsn; i++) { + TAILQ_FOREACH_SAFE(old_pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *)&old_pdu->bhs; + if (from_be32(&datain_header->itt) == task_tag && + from_be32(&datain_header->data_sn) == i) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + spdk_iscsi_conn_write_pdu(conn, old_pdu); + break; + } + } + } + } + return 0; +} + +/* This function is used to handle the status snack */ +static int +spdk_iscsi_handle_status_snack(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + uint32_t beg_run; + uint32_t run_length; + struct iscsi_bhs_snack_req *reqh; + uint32_t i; + uint32_t last_statsn; + bool found_pdu; + struct spdk_iscsi_pdu *old_pdu; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, conn->StatSN=" + "%d, conn->exp_statsn=%d\n", beg_run, run_length, + conn->StatSN, conn->exp_statsn); + + if (!beg_run) { + beg_run = conn->exp_statsn; + } else if (beg_run < conn->exp_statsn) { + SPDK_ERRLOG("Got Status SNACK Begrun: 0x%08x, RunLength: 0x%08x " + "but already got ExpStatSN: 0x%08x on CID:%hu.\n", + beg_run, run_length, conn->StatSN, conn->cid); + + return spdk_iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_PDU_FIELD); + } + + last_statsn = 
(!run_length) ? conn->StatSN : (beg_run + run_length); + + for (i = beg_run; i < last_statsn; i++) { + found_pdu = false; + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (from_be32(&old_pdu->bhs.stat_sn) == i) { + found_pdu = true; + break; + } + } + + if (!found_pdu) { + SPDK_ERRLOG("Unable to find StatSN: 0x%08x. For a Status" + "SNACK, assuming this is a proactive SNACK " + "for an untransmitted StatSN, ignoring.\n", + beg_run); + } else { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + spdk_iscsi_conn_write_pdu(conn, old_pdu); + } + } + + return 0; +} + +/* This function is used to handle the data ack snack */ +static int +spdk_iscsi_handle_data_ack(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + uint32_t transfer_tag; + uint32_t beg_run; + uint32_t run_length; + struct spdk_iscsi_pdu *old_pdu; + uint32_t old_datasn; + int rc; + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + struct iscsi_bhs_data_in *datain_header; + struct spdk_iscsi_task *primary; + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + transfer_tag = from_be32(&reqh->ttt); + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + task = NULL; + datain_header = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d,transfer_tag=%d,run_len=%d\n", + beg_run, transfer_tag, run_length); + + task = spdk_get_scsi_task_from_ttt(conn, transfer_tag); + if (!task) { + SPDK_ERRLOG("Data ACK SNACK for TTT: 0x%08x is invalid.\n", + transfer_tag); + goto reject_return; + } + + primary = spdk_iscsi_task_get_primary(task); + if ((run_length != 0) || (beg_run < primary->acked_data_sn)) { + SPDK_ERRLOG("TTT: 0x%08x Data ACK SNACK BegRUN: %d is less than " + "the next expected acked DataSN: %d\n", + transfer_tag, beg_run, primary->acked_data_sn); + goto reject_return; + } + + primary->acked_data_sn = beg_run; + + /* To free the pdu */ + TAILQ_FOREACH(old_pdu, &conn->snack_pdu_list, tailq) { + if (old_pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + datain_header = (struct iscsi_bhs_data_in *) &old_pdu->bhs; + old_datasn = from_be32(&datain_header->data_sn); + if ((from_be32(&datain_header->ttt) == transfer_tag) && + (old_datasn == beg_run - 1)) { + TAILQ_REMOVE(&conn->snack_pdu_list, old_pdu, tailq); + if (old_pdu->task) { + spdk_iscsi_task_put(old_pdu->task); + } + spdk_put_pdu(old_pdu); + break; + } + } + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Received Data ACK SNACK for TTT: 0x%08x," + " updated acked DataSN to 0x%08x.\n", transfer_tag, + (task->acked_data_sn - 1)); + + return 0; + +reject_return: + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_INVALID_SNACK); + if (rc < 0) { + SPDK_ERRLOG("iscsi_reject() failed\n"); + return -1; + } + + return 0; +} + +/* This function is used to remove the r2t pdu from snack_pdu_list by < task, r2t_sn> info */ +static struct spdk_iscsi_pdu * +spdk_iscsi_remove_r2t_pdu_from_snack_list(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, + uint32_t r2t_sn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *r2t_header; + + TAILQ_FOREACH(pdu, &conn->snack_pdu_list, tailq) { + if (pdu->bhs.opcode == ISCSI_OP_R2T) { + r2t_header = (struct iscsi_bhs_r2t *)&pdu->bhs; + if (pdu->task == task && + from_be32(&r2t_header->r2t_sn) == r2t_sn) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + return pdu; + } + } + } + + return NULL; +} + +/* This function is used re-send the r2t packet */ +static int +spdk_iscsi_send_r2t_recovery(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, uint32_t r2t_sn, + bool 
send_new_r2tsn) +{ + struct spdk_iscsi_pdu *pdu; + struct iscsi_bhs_r2t *rsph; + uint32_t transfer_len; + uint32_t len; + int rc; + + /* remove the r2t pdu from the snack_list */ + pdu = spdk_iscsi_remove_r2t_pdu_from_snack_list(conn, task, r2t_sn); + if (!pdu) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "No pdu is found\n"); + return -1; + } + + /* flag + * false: only need to re-send the old r2t with changing statsn + * true: we send a r2t with new r2tsn + */ + if (!send_new_r2tsn) { + to_be32(&pdu->bhs.stat_sn, conn->StatSN); + spdk_iscsi_conn_write_pdu(conn, pdu); + } else { + rsph = (struct iscsi_bhs_r2t *)&pdu->bhs; + transfer_len = from_be32(&rsph->desired_xfer_len); + + /* still need to increase the acked r2tsn */ + task->acked_r2tsn++; + len = DMIN32(conn->sess->MaxBurstLength, (transfer_len - + task->next_expected_r2t_offset)); + + /* remove the old_r2t_pdu */ + if (pdu->task) { + spdk_iscsi_task_put(pdu->task); + } + spdk_put_pdu(pdu); + + /* re-send a new r2t pdu */ + rc = spdk_iscsi_send_r2t(conn, task, task->next_expected_r2t_offset, + len, task->ttt, &task->R2TSN); + if (rc < 0) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + return 0; +} + +/* This function is used to handle the snack request from the initiator */ +static int +spdk_iscsi_op_snack(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + struct iscsi_bhs_snack_req *reqh; + struct spdk_iscsi_task *task; + int type; + uint32_t task_tag; + uint32_t beg_run; + uint32_t run_length; + int rc; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SNACK not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + if (!conn->sess->ErrorRecoveryLevel) { + SPDK_ERRLOG("Got a SNACK request in ErrorRecoveryLevel=0\n"); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + if (rc < 0) { + SPDK_ERRLOG("iscsi_reject() failed\n"); + return -1; + } + return rc; + } + + type = reqh->flags & ISCSI_FLAG_SNACK_TYPE_MASK; + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "The value of type is %d\n", type); + + switch (type) { + case 0: + reqh = (struct iscsi_bhs_snack_req *)&pdu->bhs; + task_tag = from_be32(&reqh->itt); + beg_run = from_be32(&reqh->beg_run); + run_length = from_be32(&reqh->run_len); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "beg_run=%d, run_length=%d, " + "task_tag=%x, transfer_tag=%u\n", beg_run, + run_length, task_tag, from_be32(&reqh->ttt)); + + task = spdk_get_scsi_task_from_itt(conn, task_tag, + ISCSI_OP_SCSI_DATAIN); + if (task) { + return spdk_iscsi_handle_recovery_datain(conn, task, pdu, + beg_run, run_length, task_tag); + } + task = spdk_get_scsi_task_from_itt(conn, task_tag, ISCSI_OP_R2T); + if (task) { + return spdk_iscsi_handle_r2t_snack(conn, task, pdu, beg_run, + run_length, task_tag); + } + SPDK_ERRLOG("It is Neither datain nor r2t recovery request\n"); + rc = -1; + break; + case ISCSI_FLAG_SNACK_TYPE_STATUS: + rc = spdk_iscsi_handle_status_snack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_DATA_ACK: + rc = spdk_iscsi_handle_data_ack(conn, pdu); + break; + case ISCSI_FLAG_SNACK_TYPE_RDATA: + SPDK_ERRLOG("R-Data SNACK is Not Supported int spdk\n"); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + default: + SPDK_ERRLOG("Unknown SNACK type %d, protocol error\n", type); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + break; + } + + return rc; +} + +/* This function is used to refree the pdu when it is acknowledged */ +static void +spdk_remove_acked_pdu(struct 
spdk_iscsi_conn *conn, + uint32_t ExpStatSN) +{ + struct spdk_iscsi_pdu *pdu, *pdu_temp; + uint32_t stat_sn; + + conn->exp_statsn = DMIN32(ExpStatSN, conn->StatSN); + TAILQ_FOREACH_SAFE(pdu, &conn->snack_pdu_list, tailq, pdu_temp) { + stat_sn = from_be32(&pdu->bhs.stat_sn); + if (SN32_LT(stat_sn, conn->exp_statsn)) { + TAILQ_REMOVE(&conn->snack_pdu_list, pdu, tailq); + spdk_iscsi_conn_free_pdu(conn, pdu); + } + } +} + +static int spdk_iscsi_op_data(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_pdu *pdu) +{ + struct spdk_iscsi_task *task, *subtask; + struct iscsi_bhs_data_out *reqh; + struct spdk_scsi_lun *lun_dev; + uint32_t transfer_tag; + uint32_t task_tag; + uint32_t transfer_len; + uint32_t DataSN; + uint32_t buffer_offset; + uint32_t len; + int F_bit; + int rc; + int reject_reason = ISCSI_REASON_INVALID_PDU_FIELD; + + if (conn->sess->session_type == SESSION_TYPE_DISCOVERY) { + SPDK_ERRLOG("ISCSI_OP_SCSI_DATAOUT not allowed in discovery session\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + reqh = (struct iscsi_bhs_data_out *)&pdu->bhs; + F_bit = !!(reqh->flags & ISCSI_FLAG_FINAL); + transfer_tag = from_be32(&reqh->ttt); + task_tag = from_be32(&reqh->itt); + DataSN = from_be32(&reqh->data_sn); + buffer_offset = from_be32(&reqh->buffer_offset); + + task = spdk_get_transfer_task(conn, transfer_tag); + if (task == NULL) { + SPDK_ERRLOG("Not found task for transfer_tag=%x\n", transfer_tag); + goto reject_return; + } + + lun_dev = spdk_scsi_dev_get_lun(conn->dev, task->lun_id); + + if (pdu->data_segment_len > task->desired_data_transfer_length) { + SPDK_ERRLOG("the dataout pdu data length is larger than the value sent by R2T PDU\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + if (task->tag != task_tag) { + SPDK_ERRLOG("The r2t task tag is %u, and the dataout task tag is %u\n", + task->tag, task_tag); + goto reject_return; + } + + if (DataSN != task->r2t_datasn) { + SPDK_ERRLOG("DataSN(%u) exp=%d error\n", DataSN, task->r2t_datasn); + if (conn->sess->ErrorRecoveryLevel >= 1) { + goto send_r2t_recovery_return; + } else { + reject_reason = ISCSI_REASON_PROTOCOL_ERROR; + goto reject_return; + } + } + + if (buffer_offset != task->next_expected_r2t_offset) { + SPDK_ERRLOG("offset(%u) error\n", buffer_offset); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + transfer_len = task->scsi.transfer_len; + task->current_r2t_length += pdu->data_segment_len; + task->next_expected_r2t_offset += pdu->data_segment_len; + task->r2t_datasn++; + + if (task->current_r2t_length > conn->sess->MaxBurstLength) { + SPDK_ERRLOG("R2T burst(%u) > MaxBurstLength(%u)\n", + task->current_r2t_length, + conn->sess->MaxBurstLength); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + if (F_bit) { + /* + * This R2T burst is done. Clear the length before we + * receive a PDU for the next R2T burst. 
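+ *
+ * Illustrative sizing (assuming the defaults from iscsi.h rather than a
+ * negotiated value): with MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH
+ * (16 * 64 KiB = 1 MiB) and Data-OUT PDUs capped at 64 KiB, one burst is
+ * at most 16 Data-OUT PDUs; current_r2t_length grows by data_segment_len
+ * for each of them until the final PDU carries the F bit and the counter
+ * is reset below.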
+ */ + task->current_r2t_length = 0; + } + + subtask = spdk_iscsi_task_get(conn, task, spdk_iscsi_task_cpl); + if (subtask == NULL) { + SPDK_ERRLOG("Unable to acquire subtask\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + subtask->scsi.offset = buffer_offset; + subtask->scsi.length = pdu->data_segment_len; + spdk_scsi_task_set_data(&subtask->scsi, pdu->data, pdu->data_segment_len); + spdk_iscsi_task_associate_pdu(subtask, pdu); + + if (task->next_expected_r2t_offset == transfer_len) { + task->acked_r2tsn++; + } else if (F_bit && (task->next_r2t_offset < transfer_len)) { + task->acked_r2tsn++; + len = DMIN32(conn->sess->MaxBurstLength, (transfer_len - + task->next_r2t_offset)); + rc = spdk_iscsi_send_r2t(conn, task, task->next_r2t_offset, len, + task->ttt, &task->R2TSN); + if (rc < 0) { + SPDK_ERRLOG("iscsi_send_r2t() failed\n"); + } + task->next_r2t_offset += len; + } + + if (lun_dev == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "LUN %d is removed, complete the task immediately\n", + task->lun_id); + subtask->scsi.transfer_len = subtask->scsi.length; + spdk_scsi_task_process_null_lun(&subtask->scsi); + spdk_iscsi_task_cpl(&subtask->scsi); + return 0; + } + + spdk_iscsi_queue_task(conn, subtask); + return 0; + +send_r2t_recovery_return: + rc = spdk_iscsi_send_r2t_recovery(conn, task, task->acked_r2tsn, true); + if (rc == 0) { + return 0; + } + +reject_return: + rc = spdk_iscsi_reject(conn, pdu, reject_reason); + if (rc < 0) { + SPDK_ERRLOG("iscsi_reject() failed\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + return SPDK_SUCCESS; +} + +static int +spdk_iscsi_send_r2t(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task, int offset, + int len, uint32_t transfer_tag, uint32_t *R2TSN) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_r2t *rsph; + + /* R2T PDU */ + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + rsph = (struct iscsi_bhs_r2t *)&rsp_pdu->bhs; + rsp_pdu->data = NULL; + rsph->opcode = ISCSI_OP_R2T; + rsph->flags |= 0x80; /* bit 0 is default to 1 */ + to_be64(&rsph->lun, task->lun_id); + to_be32(&rsph->itt, task->tag); + to_be32(&rsph->ttt, transfer_tag); + + to_be32(&rsph->stat_sn, conn->StatSN); + to_be32(&rsph->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsph->max_cmd_sn, conn->sess->MaxCmdSN); + + to_be32(&rsph->r2t_sn, *R2TSN); + *R2TSN += 1; + + task->r2t_datasn = 0; /* next expected datasn to ack */ + + to_be32(&rsph->buffer_offset, (uint32_t)offset); + to_be32(&rsph->desired_xfer_len, (uint32_t)len); + task->desired_data_transfer_length = (size_t)len; + + /* we need to hold onto this task/cmd because until the PDU has been + * written out */ + rsp_pdu->task = task; + task->scsi.ref++; + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + + return SPDK_SUCCESS; +} + +void spdk_iscsi_send_nopin(struct spdk_iscsi_conn *conn) +{ + struct spdk_iscsi_pdu *rsp_pdu; + struct iscsi_bhs_nop_in *rsp; + + /* Only send nopin if we have logged in and are in a normal session. 
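+ *
+ * Illustrative note on the keepalive round trip (the timeout value and the
+ * poller enforcing it live outside this file and are assumed here): after a
+ * NOP-In has been queued, nop_outstanding is true and last_nopin records
+ * the send time, so a caller could detect a dead initiator roughly as
+ *
+ *     if (conn->nop_outstanding &&
+ *         spdk_get_ticks() - conn->last_nopin > nop_timeout_ticks) {
+ *             conn->state = ISCSI_CONN_STATE_EXITING;
+ *     }
+ *
+ * spdk_iscsi_op_nopout() earlier in this file clears nop_outstanding when
+ * the initiator's reply arrives.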
*/ + if (conn->sess == NULL || + !conn->full_feature || + !spdk_iscsi_param_eq_val(conn->sess->params, "SessionType", "Normal")) { + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "send NOPIN isid=%"PRIx64", tsih=%u, cid=%u\n", + conn->sess->isid, conn->sess->tsih, conn->cid); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN=%u, ExpCmdSN=%u, MaxCmdSN=%u\n", + conn->StatSN, conn->sess->ExpCmdSN, + conn->sess->MaxCmdSN); + + rsp_pdu = spdk_get_pdu(); + rsp = (struct iscsi_bhs_nop_in *) &rsp_pdu->bhs; + rsp_pdu->data = NULL; + + /* + * spdk_get_pdu() memset's the PDU for us, so only fill out the needed + * fields. + */ + rsp->opcode = ISCSI_OP_NOPIN; + rsp->flags = 0x80; + /* + * Technically the to_be32() is not needed here, since + * to_be32(0xFFFFFFFU) returns 0xFFFFFFFFU. + */ + to_be32(&rsp->itt, 0xFFFFFFFFU); + to_be32(&rsp->ttt, conn->id); + to_be32(&rsp->stat_sn, conn->StatSN); + to_be32(&rsp->exp_cmd_sn, conn->sess->ExpCmdSN); + to_be32(&rsp->max_cmd_sn, conn->sess->MaxCmdSN); + + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + conn->last_nopin = spdk_get_ticks(); + conn->nop_outstanding = true; +} + +static void +spdk_init_login_reject_response(struct spdk_iscsi_pdu *pdu, struct spdk_iscsi_pdu *rsp_pdu) +{ + struct iscsi_bhs_login_rsp *rsph; + + memset(rsp_pdu, 0, sizeof(struct spdk_iscsi_pdu)); + rsph = (struct iscsi_bhs_login_rsp *)&rsp_pdu->bhs; + rsph->version_max = ISCSI_VERSION; + rsph->version_act = ISCSI_VERSION; + rsph->opcode = ISCSI_OP_LOGIN_RSP; + rsph->status_class = ISCSI_CLASS_INITIATOR_ERROR; + rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST; + rsph->itt = pdu->bhs.itt; +} + +int +spdk_iscsi_execute(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu) +{ + int opcode; + int rc; + struct spdk_iscsi_pdu *rsp_pdu = NULL; + uint32_t ExpStatSN; + uint32_t QCmdSN; + int I_bit; + struct spdk_iscsi_sess *sess; + struct iscsi_bhs_scsi_req *reqh; + + if (pdu == NULL) { + return -1; + } + + opcode = pdu->bhs.opcode; + reqh = (struct iscsi_bhs_scsi_req *)&pdu->bhs; + pdu->cmd_sn = from_be32(&reqh->cmd_sn); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "opcode %x\n", opcode); + + if (opcode == ISCSI_OP_LOGIN) { + rc = spdk_iscsi_op_login(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("iscsi_op_login() failed\n"); + } + return rc; + } + + /* connection in login phase but receive non-login opcode + * return response code 0x020b to initiator. 
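+ *
+ * Illustrative note: "0x020b" is simply the two status bytes that
+ * spdk_init_login_reject_response() above writes into the Login Response:
+ *
+ *     rsph->status_class  = ISCSI_CLASS_INITIATOR_ERROR;        // 0x02
+ *     rsph->status_detail = ISCSI_LOGIN_INVALID_LOGIN_REQUEST;  // 0x0b
+ *
+ * read back-to-back as class 0x02, detail 0x0b.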
+ * */ + if (!conn->full_feature && conn->state == ISCSI_CONN_STATE_RUNNING) { + rsp_pdu = spdk_get_pdu(); + if (rsp_pdu == NULL) { + return SPDK_ISCSI_CONNECTION_FATAL; + } + spdk_init_login_reject_response(pdu, rsp_pdu); + spdk_iscsi_conn_write_pdu(conn, rsp_pdu); + SPDK_ERRLOG("Received opcode %d in login phase\n", opcode); + return SPDK_ISCSI_LOGIN_ERROR_RESPONSE; + } else if (conn->state == ISCSI_CONN_STATE_INVALID) { + SPDK_ERRLOG("before Full Feature\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + + sess = conn->sess; + if (!sess) { + SPDK_ERRLOG("Connection has no associated session!\n"); + return SPDK_ISCSI_CONNECTION_FATAL; + } + I_bit = reqh->immediate; + if (I_bit == 0) { + if (SN32_LT(pdu->cmd_sn, sess->ExpCmdSN) || + SN32_GT(pdu->cmd_sn, sess->MaxCmdSN)) { + if (sess->session_type == SESSION_TYPE_NORMAL && + opcode != ISCSI_OP_SCSI_DATAOUT) { + SPDK_ERRLOG("CmdSN(%u) ignore (ExpCmdSN=%u, MaxCmdSN=%u)\n", + pdu->cmd_sn, sess->ExpCmdSN, sess->MaxCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else { + return SPDK_PDU_FATAL; + } + } + } + } else if (pdu->cmd_sn != sess->ExpCmdSN) { + SPDK_ERRLOG("CmdSN(%u) error ExpCmdSN=%u\n", pdu->cmd_sn, sess->ExpCmdSN); + + if (sess->ErrorRecoveryLevel >= 1) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Skip the error in ERL 1 and 2\n"); + } else if (opcode != ISCSI_OP_NOPOUT) { + /* + * The Linux initiator does not send valid CmdSNs for + * nopout under heavy load, so do not close the + * connection in that case. + */ + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + ExpStatSN = from_be32(&reqh->exp_stat_sn); + if (SN32_GT(ExpStatSN, conn->StatSN)) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "StatSN(%u) advanced\n", ExpStatSN); + ExpStatSN = conn->StatSN; + } + + if (sess->ErrorRecoveryLevel >= 1) { + spdk_remove_acked_pdu(conn, ExpStatSN); + } + + if (opcode == ISCSI_OP_NOPOUT || opcode == ISCSI_OP_SCSI) { + QCmdSN = sess->MaxCmdSN - sess->ExpCmdSN + 1; + QCmdSN += sess->queue_depth; + if (SN32_LT(ExpStatSN + QCmdSN, conn->StatSN)) { + SPDK_ERRLOG("StatSN(%u/%u) QCmdSN(%u) error\n", + ExpStatSN, conn->StatSN, QCmdSN); + return SPDK_ISCSI_CONNECTION_FATAL; + } + } + + if (!I_bit && opcode != ISCSI_OP_SCSI_DATAOUT) { + sess->ExpCmdSN++; + } + + switch (opcode) { + case ISCSI_OP_NOPOUT: + rc = spdk_iscsi_op_nopout(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_nopout() failed\n"); + return rc; + } + break; + + case ISCSI_OP_SCSI: + rc = spdk_iscsi_op_scsi(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_scsi() failed\n"); + return rc; + } + break; + case ISCSI_OP_TASK: + rc = spdk_iscsi_op_task(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_task() failed\n"); + return rc; + } + break; + + case ISCSI_OP_TEXT: + rc = spdk_iscsi_op_text(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_text() failed\n"); + return rc; + } + break; + + case ISCSI_OP_LOGOUT: + rc = spdk_iscsi_op_logout(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_logout() failed\n"); + return rc; + } + break; + + case ISCSI_OP_SCSI_DATAOUT: + rc = spdk_iscsi_op_data(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_data() failed\n"); + return rc; + } + break; + + case ISCSI_OP_SNACK: + rc = spdk_iscsi_op_snack(conn, pdu); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_op_snack() failed\n"); + return rc; + } + break; + + default: + SPDK_ERRLOG("unsupported opcode %x\n", opcode); + rc = spdk_iscsi_reject(conn, pdu, ISCSI_REASON_PROTOCOL_ERROR); + if (rc < 0) { + 
SPDK_ERRLOG("spdk_iscsi_reject() failed\n"); + return rc; + } + break; + } + + return 0; +} + +void spdk_free_sess(struct spdk_iscsi_sess *sess) +{ + if (sess == NULL) { + return; + } + + sess->tag = 0; + sess->target = NULL; + sess->session_type = SESSION_TYPE_INVALID; + spdk_iscsi_param_free(sess->params); + free(sess->conns); + spdk_scsi_port_free(&sess->initiator_port); + spdk_mempool_put(g_spdk_iscsi.session_pool, (void *)sess); +} + +static int +spdk_create_iscsi_sess(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, + enum session_type session_type) +{ + struct spdk_iscsi_sess *sess; + int rc; + + sess = spdk_mempool_get(g_spdk_iscsi.session_pool); + if (!sess) { + SPDK_ERRLOG("Unable to get session object\n"); + SPDK_ERRLOG("MaxSessions set to %d\n", g_spdk_iscsi.MaxSessions); + return -ENOMEM; + } + + /* configuration values */ + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + sess->MaxConnections = g_spdk_iscsi.MaxConnectionsPerSession; + sess->MaxOutstandingR2T = DEFAULT_MAXOUTSTANDINGR2T; + + sess->DefaultTime2Wait = g_spdk_iscsi.DefaultTime2Wait; + sess->DefaultTime2Retain = g_spdk_iscsi.DefaultTime2Retain; + sess->FirstBurstLength = g_spdk_iscsi.FirstBurstLength; + sess->MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + sess->InitialR2T = DEFAULT_INITIALR2T; + sess->ImmediateData = g_spdk_iscsi.ImmediateData; + sess->DataPDUInOrder = DEFAULT_DATAPDUINORDER; + sess->DataSequenceInOrder = DEFAULT_DATASEQUENCEINORDER; + sess->ErrorRecoveryLevel = g_spdk_iscsi.ErrorRecoveryLevel; + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + sess->tag = conn->portal->group->tag; + + sess->conns = calloc(sess->MaxConnections, sizeof(*sess->conns)); + if (!sess->conns) { + SPDK_ERRLOG("calloc() failed for connection array\n"); + return -ENOMEM; + } + + sess->connections = 0; + + sess->conns[sess->connections] = conn; + sess->connections++; + + sess->params = NULL; + sess->target = NULL; + sess->isid = 0; + sess->session_type = session_type; + sess->current_text_itt = 0xffffffffU; + + /* set default params */ + rc = spdk_iscsi_sess_params_init(&sess->params); + if (rc < 0) { + SPDK_ERRLOG("iscsi_sess_params_init() failed\n"); + goto error_return; + } + /* replace with config value */ + rc = spdk_iscsi_param_set_int(sess->params, "MaxConnections", + sess->MaxConnections); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "MaxOutstandingR2T", + sess->MaxOutstandingR2T); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "DefaultTime2Wait", + sess->DefaultTime2Wait); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "DefaultTime2Retain", + sess->DefaultTime2Retain); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "FirstBurstLength", + sess->FirstBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "MaxBurstLength", + sess->MaxBurstLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set(sess->params, "InitialR2T", + sess->InitialR2T ? 
"Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set(sess->params, "ImmediateData", + sess->ImmediateData ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set(sess->params, "DataPDUInOrder", + sess->DataPDUInOrder ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set(sess->params, "DataSequenceInOrder", + sess->DataSequenceInOrder ? "Yes" : "No"); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set() failed\n"); + goto error_return; + } + + rc = spdk_iscsi_param_set_int(sess->params, "ErrorRecoveryLevel", + sess->ErrorRecoveryLevel); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* realloc buffer */ + rc = spdk_iscsi_param_set_int(conn->params, "MaxRecvDataSegmentLength", + conn->MaxRecvDataSegmentLength); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_set_int() failed\n"); + goto error_return; + } + + /* sess for first connection of session */ + conn->sess = sess; + return 0; + +error_return: + spdk_free_sess(sess); + conn->sess = NULL; + return -1; +} + +static struct spdk_iscsi_sess * +spdk_get_iscsi_sess_by_tsih(uint16_t tsih) +{ + struct spdk_iscsi_sess *session; + + if (tsih == 0 || tsih > g_spdk_iscsi.MaxSessions) { + return NULL; + } + + session = g_spdk_iscsi.session[tsih - 1]; + assert(tsih == session->tsih); + + return session; +} + +static int +spdk_append_iscsi_sess(struct spdk_iscsi_conn *conn, + const char *initiator_port_name, uint16_t tsih, uint16_t cid) +{ + struct spdk_iscsi_sess *sess; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "append session: init port name=%s, tsih=%u, cid=%u\n", + initiator_port_name, tsih, cid); + + sess = spdk_get_iscsi_sess_by_tsih(tsih); + if (sess == NULL) { + SPDK_ERRLOG("spdk_get_iscsi_sess_by_tsih failed\n"); + return -1; + } + if ((conn->portal->group->tag != sess->tag) || + (strcasecmp(initiator_port_name, spdk_scsi_port_get_name(sess->initiator_port)) != 0) || + (conn->target != sess->target)) { + /* no match */ + SPDK_ERRLOG("no MCS session for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return -1; + } + + if (sess->connections >= sess->MaxConnections) { + /* no slot for connection */ + SPDK_ERRLOG("too many connections for init port name=%s, tsih=%d, cid=%d\n", + initiator_port_name, tsih, cid); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Connections (tsih %d): %d\n", sess->tsih, sess->connections); + conn->sess = sess; + + /* + * TODO: need a mutex or other sync mechanism to protect the session's + * connection list. + */ + sess->conns[sess->connections] = conn; + sess->connections++; + + return 0; +} + +bool spdk_iscsi_is_deferred_free_pdu(struct spdk_iscsi_pdu *pdu) +{ + if (pdu == NULL) { + return false; + } + + if (pdu->bhs.opcode == ISCSI_OP_R2T || + pdu->bhs.opcode == ISCSI_OP_SCSI_DATAIN) { + return true; + } + + return false; +} diff --git a/src/spdk/lib/iscsi/iscsi.h b/src/spdk/lib/iscsi/iscsi.h new file mode 100644 index 00000000..3cfb20fc --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi.h @@ -0,0 +1,467 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_H +#define SPDK_ISCSI_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/iscsi_spec.h" +#include "spdk/event.h" +#include "spdk/thread.h" + +#include "iscsi/param.h" +#include "iscsi/tgt_node.h" + +#include "spdk/assert.h" +#include "spdk/util.h" + +#define SPDK_ISCSI_DEFAULT_NODEBASE "iqn.2016-06.io.spdk" + +#define DEFAULT_MAXR2T 4 +#define MAX_INITIATOR_NAME 256 +#define MAX_TARGET_NAME 256 + +#define MAX_PORTAL 1024 +#define MAX_INITIATOR 256 +#define MAX_NETMASK 256 +#define MAX_SESSIONS 1024 +#define MAX_ISCSI_CONNECTIONS MAX_SESSIONS +#define MAX_FIRSTBURSTLENGTH 16777215 + +#define DEFAULT_PORT 3260 +#define DEFAULT_MAX_SESSIONS 128 +#define DEFAULT_MAX_CONNECTIONS_PER_SESSION 2 +#define DEFAULT_MAXOUTSTANDINGR2T 1 +#define DEFAULT_DEFAULTTIME2WAIT 2 +#define DEFAULT_DEFAULTTIME2RETAIN 20 +#define DEFAULT_FIRSTBURSTLENGTH 8192 +#define DEFAULT_INITIALR2T true +#define DEFAULT_IMMEDIATEDATA true +#define DEFAULT_DATAPDUINORDER true +#define DEFAULT_DATASEQUENCEINORDER true +#define DEFAULT_ERRORRECOVERYLEVEL 0 +#define DEFAULT_TIMEOUT 60 +#define MAX_NOPININTERVAL 60 +#define DEFAULT_NOPININTERVAL 30 +#define DEFAULT_CONNECTIONS_PER_LCORE 4 + +/* + * SPDK iSCSI target currently only supports 64KB as the maximum data segment length + * it can receive from initiators. Other values may work, but no guarantees. + */ +#define SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH 65536 + +/* + * SPDK iSCSI target will only send a maximum of SPDK_BDEV_LARGE_BUF_MAX_SIZE data segments, even if the + * connection can support more. + */ +#define SPDK_ISCSI_MAX_SEND_DATA_SEGMENT_LENGTH SPDK_BDEV_LARGE_BUF_MAX_SIZE + +/* + * Defines maximum number of data out buffers each connection can have in + * use at any given time. + */ +#define MAX_DATA_OUT_PER_CONNECTION 16 + +/* + * Defines maximum number of data in buffers each connection can have in + * use at any given time. So this limit does not affect I/O smaller than + * SPDK_BDEV_SMALL_BUF_MAX_SIZE. 
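+ *
+ * Illustrative sizing (the 64 KiB large-buffer size is an assumption about
+ * SPDK_BDEV_LARGE_BUF_MAX_SIZE, which is defined elsewhere): with the value
+ * of 64 below, one connection holds at most roughly
+ *
+ *     64 * 64 KiB = 4 MiB
+ *
+ * of large Data-In payload at a time, while reads small enough to use small
+ * buffers are not counted against the limit.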
+ */ +#define MAX_LARGE_DATAIN_PER_CONNECTION 64 + +/* + * Defines default maximum queue depth per connection and this can be + * changed by configuration file. + */ +#define DEFAULT_MAX_QUEUE_DEPTH 64 + +#define SPDK_ISCSI_MAX_BURST_LENGTH \ + (SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH * MAX_DATA_OUT_PER_CONNECTION) + +/* + * Defines default maximum amount in bytes of unsolicited data the iSCSI + * initiator may send to the SPDK iSCSI target during the execution of + * a single SCSI command. And it is smaller than the MaxBurstLength. + */ +#define SPDK_ISCSI_FIRST_BURST_LENGTH 8192 + +/* + * Defines minimum amount in bytes of unsolicited data the iSCSI initiator + * may send to the SPDK iSCSI target during the execution of a single + * SCSI command. + */ +#define SPDK_ISCSI_MIN_FIRST_BURST_LENGTH 512 + +/** Defines how long we should wait for a TCP close after responding to a + * logout request, before terminating the connection ourselves. + */ +#define ISCSI_LOGOUT_TIMEOUT 5 /* in seconds */ + +/* according to RFC1982 */ +#define SN32_CMPMAX (((uint32_t)1U) << (32 - 1)) +#define SN32_LT(S1,S2) \ + (((uint32_t)(S1) != (uint32_t)(S2)) \ + && (((uint32_t)(S1) < (uint32_t)(S2) \ + && ((uint32_t)(S2) - (uint32_t)(S1) < SN32_CMPMAX)) \ + || ((uint32_t)(S1) > (uint32_t)(S2) \ + && ((uint32_t)(S1) - (uint32_t)(S2) > SN32_CMPMAX)))) +#define SN32_GT(S1,S2) \ + (((uint32_t)(S1) != (uint32_t)(S2)) \ + && (((uint32_t)(S1) < (uint32_t)(S2) \ + && ((uint32_t)(S2) - (uint32_t)(S1) > SN32_CMPMAX)) \ + || ((uint32_t)(S1) > (uint32_t)(S2) \ + && ((uint32_t)(S1) - (uint32_t)(S2) < SN32_CMPMAX)))) + +/* For spdk_iscsi_login_in related function use, we need to avoid the conflict + * with other errors + * */ +#define SPDK_ISCSI_LOGIN_ERROR_RESPONSE -1000 +#define SPDK_ISCSI_LOGIN_ERROR_PARAMETER -1001 +#define SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE -1002 + +#define ISCSI_AHS_LEN 60 + +struct spdk_mobj { + struct spdk_mempool *mp; + void *buf; + size_t len; + uint64_t reserved; /* do not use */ +}; + +struct spdk_iscsi_pdu { + struct iscsi_bhs bhs; + struct spdk_mobj *mobj; + uint8_t *data_buf; + uint8_t *data; + uint8_t header_digest[ISCSI_DIGEST_LEN]; + uint8_t data_digest[ISCSI_DIGEST_LEN]; + size_t data_segment_len; + int bhs_valid_bytes; + int ahs_valid_bytes; + int data_valid_bytes; + int hdigest_valid_bytes; + int ddigest_valid_bytes; + int ref; + bool data_from_mempool; /* indicate whether the data buffer is allocated from mempool */ + struct spdk_iscsi_task *task; /* data tied to a task buffer */ + uint32_t cmd_sn; + uint32_t writev_offset; + TAILQ_ENTRY(spdk_iscsi_pdu) tailq; + + + /* + * 60 bytes of AHS should suffice for now. + * This should always be at the end of PDU data structure. + * we need to not zero this out when doing memory clear. 
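+ *
+ * Illustrative note (the actual reset code lives in iscsi.c and is assumed
+ * here): keeping ahs[] and the sense buffer as the last members lets a
+ * recycled PDU be cleared with a partial memset such as
+ *
+ *     memset(pdu, 0, offsetof(struct spdk_iscsi_pdu, ahs));
+ *
+ * which zeroes all of the bookkeeping fields without paying to wipe these
+ * large, rarely used trailing buffers.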
+ */ + uint8_t ahs[ISCSI_AHS_LEN]; + + struct { + uint16_t length; /* iSCSI SenseLength (big-endian) */ + uint8_t data[32]; + } sense; +}; + +enum iscsi_connection_state { + ISCSI_CONN_STATE_INVALID = 0, + ISCSI_CONN_STATE_RUNNING = 1, + ISCSI_CONN_STATE_LOGGED_OUT = 2, + ISCSI_CONN_STATE_EXITING = 3, + ISCSI_CONN_STATE_EXITED = 4, +}; + +enum iscsi_chap_phase { + ISCSI_CHAP_PHASE_NONE = 0, + ISCSI_CHAP_PHASE_WAIT_A = 1, + ISCSI_CHAP_PHASE_WAIT_NR = 2, + ISCSI_CHAP_PHASE_END = 3, +}; + +enum session_type { + SESSION_TYPE_INVALID = 0, + SESSION_TYPE_NORMAL = 1, + SESSION_TYPE_DISCOVERY = 2, +}; + +#define ISCSI_CHAP_CHALLENGE_LEN 1024 +#define ISCSI_CHAP_MAX_USER_LEN 255 +#define ISCSI_CHAP_MAX_SECRET_LEN 255 + +struct iscsi_chap_auth { + enum iscsi_chap_phase chap_phase; + + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + + uint8_t chap_id[1]; + uint8_t chap_mid[1]; + int chap_challenge_len; + uint8_t chap_challenge[ISCSI_CHAP_CHALLENGE_LEN]; + int chap_mchallenge_len; + uint8_t chap_mchallenge[ISCSI_CHAP_CHALLENGE_LEN]; +}; + +struct spdk_iscsi_auth_secret { + char user[ISCSI_CHAP_MAX_USER_LEN + 1]; + char secret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + char muser[ISCSI_CHAP_MAX_USER_LEN + 1]; + char msecret[ISCSI_CHAP_MAX_SECRET_LEN + 1]; + TAILQ_ENTRY(spdk_iscsi_auth_secret) tailq; +}; + +struct spdk_iscsi_auth_group { + int32_t tag; + TAILQ_HEAD(, spdk_iscsi_auth_secret) secret_head; + TAILQ_ENTRY(spdk_iscsi_auth_group) tailq; +}; + +struct spdk_iscsi_sess { + uint32_t connections; + struct spdk_iscsi_conn **conns; + + struct spdk_scsi_port *initiator_port; + int tag; + + uint64_t isid; + uint16_t tsih; + struct spdk_iscsi_tgt_node *target; + int queue_depth; + + struct iscsi_param *params; + + enum session_type session_type; + uint32_t MaxConnections; + uint32_t MaxOutstandingR2T; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool InitialR2T; + bool ImmediateData; + bool DataPDUInOrder; + bool DataSequenceInOrder; + uint32_t ErrorRecoveryLevel; + + uint32_t ExpCmdSN; + uint32_t MaxCmdSN; + + uint32_t current_text_itt; +}; + +struct spdk_iscsi_poll_group { + uint32_t core; + struct spdk_poller *poller; + struct spdk_poller *nop_poller; + STAILQ_HEAD(connections, spdk_iscsi_conn) connections; + struct spdk_sock_group *sock_group; +}; + +struct spdk_iscsi_opts { + char *authfile; + char *nodebase; + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; + uint32_t min_connections_per_core; +}; + +struct spdk_iscsi_globals { + char *authfile; + char *nodebase; + pthread_mutex_t mutex; + TAILQ_HEAD(, spdk_iscsi_portal) portal_head; + TAILQ_HEAD(, spdk_iscsi_portal_grp) pg_head; + TAILQ_HEAD(, spdk_iscsi_init_grp) ig_head; + TAILQ_HEAD(, spdk_iscsi_tgt_node) target_head; + TAILQ_HEAD(, spdk_iscsi_auth_group) auth_group_head; + + int32_t timeout; + int32_t nopininterval; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; + + uint32_t MaxSessions; + uint32_t MaxConnectionsPerSession; + uint32_t MaxConnections; + uint32_t 
MaxQueueDepth; + uint32_t DefaultTime2Wait; + uint32_t DefaultTime2Retain; + uint32_t FirstBurstLength; + bool ImmediateData; + uint32_t ErrorRecoveryLevel; + bool AllowDuplicateIsid; + + struct spdk_mempool *pdu_pool; + struct spdk_mempool *pdu_immediate_data_pool; + struct spdk_mempool *pdu_data_out_pool; + struct spdk_mempool *session_pool; + struct spdk_mempool *task_pool; + + struct spdk_iscsi_sess **session; + struct spdk_iscsi_poll_group *poll_group; +}; + +#define ISCSI_SECURITY_NEGOTIATION_PHASE 0 +#define ISCSI_OPERATIONAL_NEGOTIATION_PHASE 1 +#define ISCSI_NSG_RESERVED_CODE 2 +#define ISCSI_FULL_FEATURE_PHASE 3 + +enum spdk_error_codes { + SPDK_SUCCESS = 0, + SPDK_ISCSI_CONNECTION_FATAL = -1, + SPDK_PDU_FATAL = -2, +}; + +#define DGET24(B) \ + ((( (uint32_t) *((uint8_t *)(B)+0)) << 16) \ + | (((uint32_t) *((uint8_t *)(B)+1)) << 8) \ + | (((uint32_t) *((uint8_t *)(B)+2)) << 0)) + +#define DSET24(B,D) \ + (((*((uint8_t *)(B)+0)) = (uint8_t)((uint32_t)(D) >> 16)), \ + ((*((uint8_t *)(B)+1)) = (uint8_t)((uint32_t)(D) >> 8)), \ + ((*((uint8_t *)(B)+2)) = (uint8_t)((uint32_t)(D) >> 0))) + +#define xstrdup(s) (s ? strdup(s) : (char *)NULL) + +extern struct spdk_iscsi_globals g_spdk_iscsi; +extern struct spdk_iscsi_opts *g_spdk_iscsi_opts; + +struct spdk_iscsi_task; +struct spdk_json_write_ctx; + +typedef void (*spdk_iscsi_init_cb)(void *cb_arg, int rc); + +void spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg); +typedef void (*spdk_iscsi_fini_cb)(void *arg); +void spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg); +void spdk_shutdown_iscsi_conns_done(void); +void spdk_iscsi_config_text(FILE *fp); +void spdk_iscsi_config_json(struct spdk_json_write_ctx *w); + +struct spdk_iscsi_opts *spdk_iscsi_opts_alloc(void); +void spdk_iscsi_opts_free(struct spdk_iscsi_opts *opts); +struct spdk_iscsi_opts *spdk_iscsi_opts_copy(struct spdk_iscsi_opts *src); +void spdk_iscsi_opts_info_json(struct spdk_json_write_ctx *w); +int spdk_iscsi_set_discovery_auth(bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +int spdk_iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag); +int spdk_iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group); +struct spdk_iscsi_auth_group *spdk_iscsi_find_auth_group_by_tag(int32_t tag); +void spdk_iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group); +int spdk_iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret); +int spdk_iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user); +void spdk_iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w); + +void spdk_iscsi_send_nopin(struct spdk_iscsi_conn *conn); +void spdk_iscsi_task_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); +int spdk_iscsi_execute(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu *pdu); +int spdk_iscsi_build_iovecs(struct spdk_iscsi_conn *conn, + struct iovec *iovec, struct spdk_iscsi_pdu *pdu); +int +spdk_iscsi_read_pdu(struct spdk_iscsi_conn *conn, struct spdk_iscsi_pdu **_pdu); +void spdk_iscsi_task_mgmt_response(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *task); + +int spdk_iscsi_conn_params_init(struct iscsi_param **params); +int spdk_iscsi_sess_params_init(struct iscsi_param **params); + +void spdk_free_sess(struct spdk_iscsi_sess *sess); +void spdk_clear_all_transfer_task(struct spdk_iscsi_conn *conn, + struct spdk_scsi_lun *lun); +void 
spdk_del_transfer_task(struct spdk_iscsi_conn *conn, uint32_t CmdSN); +bool spdk_iscsi_is_deferred_free_pdu(struct spdk_iscsi_pdu *pdu); + +int spdk_iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params_p, uint8_t *data, + int alloc_len, int data_len); +int spdk_iscsi_copy_param2var(struct spdk_iscsi_conn *conn); + +void spdk_iscsi_task_cpl(struct spdk_scsi_task *scsi_task); +void spdk_iscsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task); + +/* Memory management */ +void spdk_put_pdu(struct spdk_iscsi_pdu *pdu); +struct spdk_iscsi_pdu *spdk_get_pdu(void); +int spdk_iscsi_conn_handle_queued_datain_tasks(struct spdk_iscsi_conn *conn); + +static inline int +spdk_get_immediate_data_buffer_size(void) +{ + /* + * Specify enough extra space in addition to FirstBurstLength to + * account for a header digest, data digest and additional header + * segments (AHS). These are not normally used but they do not + * take up much space and we need to make sure the worst-case scenario + * can be satisified by the size returned here. + */ + return g_spdk_iscsi.FirstBurstLength + + ISCSI_DIGEST_LEN + /* data digest */ + ISCSI_DIGEST_LEN + /* header digest */ + 8 + /* bidirectional AHS */ + 52; /* extended CDB AHS (for a 64-byte CDB) */ +} + +static inline int +spdk_get_data_out_buffer_size(void) +{ + return SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH; +} + +#endif /* SPDK_ISCSI_H */ diff --git a/src/spdk/lib/iscsi/iscsi_rpc.c b/src/spdk/lib/iscsi/iscsi_rpc.c new file mode 100644 index 00000000..dd9777a3 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_rpc.c @@ -0,0 +1,1542 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/event.h" +#include "spdk/string.h" +#include "spdk_internal/log.h" + +static void +spdk_rpc_get_initiator_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_initiator_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + spdk_iscsi_init_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_initiator_groups", spdk_rpc_get_initiator_groups, SPDK_RPC_RUNTIME) + +struct rpc_initiator_list { + size_t num_initiators; + char *initiators[MAX_INITIATOR]; +}; + +static int +decode_rpc_initiator_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_initiator_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->initiators, MAX_INITIATOR, + &list->num_initiators, sizeof(char *)); +} + +static void +free_rpc_initiator_list(struct rpc_initiator_list *list) +{ + size_t i; + + for (i = 0; i < list->num_initiators; i++) { + free(list->initiators[i]); + } +} + +struct rpc_netmask_list { + size_t num_netmasks; + char *netmasks[MAX_NETMASK]; +}; + +static int +decode_rpc_netmask_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_netmask_list *list = out; + + return spdk_json_decode_array(val, spdk_json_decode_string, list->netmasks, MAX_NETMASK, + &list->num_netmasks, sizeof(char *)); +} + +static void +free_rpc_netmask_list(struct rpc_netmask_list *list) +{ + size_t i; + + for (i = 0; i < list->num_netmasks; i++) { + free(list->netmasks[i]); + } +} + +struct rpc_initiator_group { + int32_t tag; + struct rpc_initiator_list initiator_list; + struct rpc_netmask_list netmask_list; +}; + +static void +free_rpc_initiator_group(struct rpc_initiator_group *ig) +{ + free_rpc_initiator_list(&ig->initiator_list); + free_rpc_netmask_list(&ig->netmask_list); +} + +static const struct spdk_json_object_decoder rpc_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list}, +}; + +static void +spdk_rpc_add_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_initiator_group_decoders, + SPDK_COUNTOF(rpc_initiator_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.initiator_list.num_initiators == 0 || + req.netmask_list.num_netmasks == 0) { + goto invalid; + } + + if (spdk_iscsi_init_grp_create_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("create_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, 
true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("add_initiator_group", spdk_rpc_add_initiator_group, SPDK_RPC_RUNTIME) + +static const struct spdk_json_object_decoder rpc_add_or_delete_initiators_decoders[] = { + {"tag", offsetof(struct rpc_initiator_group, tag), spdk_json_decode_int32}, + {"initiators", offsetof(struct rpc_initiator_group, initiator_list), decode_rpc_initiator_list, true}, + {"netmasks", offsetof(struct rpc_initiator_group, netmask_list), decode_rpc_netmask_list, true}, +}; + +static void +spdk_rpc_add_initiators_to_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (spdk_iscsi_init_grp_add_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("add_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("add_initiators_to_initiator_group", + spdk_rpc_add_initiators_to_initiator_group, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_delete_initiators_from_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_initiator_group req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_add_or_delete_initiators_decoders, + SPDK_COUNTOF(rpc_add_or_delete_initiators_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (spdk_iscsi_init_grp_delete_initiators_from_initiator_list(req.tag, + req.initiator_list.num_initiators, + req.initiator_list.initiators, + req.netmask_list.num_netmasks, + req.netmask_list.netmasks)) { + SPDK_ERRLOG("delete_initiators_from_initiator_list failed\n"); + goto invalid; + } + + free_rpc_initiator_group(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_initiator_group(&req); +} +SPDK_RPC_REGISTER("delete_initiators_from_initiator_group", + spdk_rpc_delete_initiators_from_initiator_group, SPDK_RPC_RUNTIME) + +struct rpc_delete_initiator_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_delete_initiator_group_decoders[] = { + {"tag", offsetof(struct rpc_delete_initiator_group, tag), spdk_json_decode_int32}, +}; + +static void +spdk_rpc_delete_initiator_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_initiator_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_init_grp *ig; + + if 
(spdk_json_decode_object(params, rpc_delete_initiator_group_decoders, + SPDK_COUNTOF(rpc_delete_initiator_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + ig = spdk_iscsi_init_grp_unregister(req.tag); + if (!ig) { + goto invalid; + } + spdk_iscsi_tgt_node_delete_map(NULL, ig); + spdk_iscsi_init_grp_destroy(ig); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("delete_initiator_group", spdk_rpc_delete_initiator_group, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_target_nodes(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_target_nodes requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + spdk_iscsi_tgt_nodes_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_target_nodes", spdk_rpc_get_target_nodes, SPDK_RPC_RUNTIME) + +struct rpc_pg_ig_map { + int32_t pg_tag; + int32_t ig_tag; +}; + +static const struct spdk_json_object_decoder rpc_pg_ig_map_decoders[] = { + {"pg_tag", offsetof(struct rpc_pg_ig_map, pg_tag), spdk_json_decode_int32}, + {"ig_tag", offsetof(struct rpc_pg_ig_map, ig_tag), spdk_json_decode_int32}, +}; + +static int +decode_rpc_pg_ig_map(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_map *pg_ig_map = out; + + return spdk_json_decode_object(val, rpc_pg_ig_map_decoders, + SPDK_COUNTOF(rpc_pg_ig_map_decoders), + pg_ig_map); +} + +struct rpc_pg_ig_maps { + size_t num_maps; + struct rpc_pg_ig_map maps[MAX_TARGET_MAP]; +}; + +static int +decode_rpc_pg_ig_maps(const struct spdk_json_val *val, void *out) +{ + struct rpc_pg_ig_maps *pg_ig_maps = out; + + return spdk_json_decode_array(val, decode_rpc_pg_ig_map, pg_ig_maps->maps, + MAX_TARGET_MAP, &pg_ig_maps->num_maps, + sizeof(struct rpc_pg_ig_map)); +} + +#define RPC_CONSTRUCT_TARGET_NODE_MAX_LUN 64 + +struct rpc_lun { + char *bdev_name; + int32_t lun_id; +}; + +static const struct spdk_json_object_decoder rpc_lun_decoders[] = { + {"bdev_name", offsetof(struct rpc_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_lun, lun_id), spdk_json_decode_int32}, +}; + +static int +decode_rpc_lun(const struct spdk_json_val *val, void *out) +{ + struct rpc_lun *lun = out; + + return spdk_json_decode_object(val, rpc_lun_decoders, + SPDK_COUNTOF(rpc_lun_decoders), lun); +} + +struct rpc_luns { + size_t num_luns; + struct rpc_lun luns[RPC_CONSTRUCT_TARGET_NODE_MAX_LUN]; +}; + +static int +decode_rpc_luns(const struct spdk_json_val *val, void *out) +{ + struct rpc_luns *luns = out; + + return spdk_json_decode_array(val, decode_rpc_lun, luns->luns, + RPC_CONSTRUCT_TARGET_NODE_MAX_LUN, + &luns->num_luns, sizeof(struct rpc_lun)); +} + +static void +free_rpc_luns(struct rpc_luns *p) +{ + size_t i; + + for (i = 0; i < p->num_luns; i++) { + free(p->luns[i].bdev_name); + } +} + +struct rpc_target_node { + char *name; + char *alias_name; + + struct rpc_pg_ig_maps pg_ig_maps; + struct rpc_luns luns; + + int32_t queue_depth; + bool disable_chap; + bool require_chap; + bool mutual_chap; + 
int32_t chap_group; + + bool header_digest; + bool data_digest; +}; + +static void +free_rpc_target_node(struct rpc_target_node *req) +{ + free(req->name); + free(req->alias_name); + free_rpc_luns(&req->luns); +} + +static const struct spdk_json_object_decoder rpc_target_node_decoders[] = { + {"name", offsetof(struct rpc_target_node, name), spdk_json_decode_string}, + {"alias_name", offsetof(struct rpc_target_node, alias_name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_target_node, pg_ig_maps), decode_rpc_pg_ig_maps}, + {"luns", offsetof(struct rpc_target_node, luns), decode_rpc_luns}, + {"queue_depth", offsetof(struct rpc_target_node, queue_depth), spdk_json_decode_int32}, + {"disable_chap", offsetof(struct rpc_target_node, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_node, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_node, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_node, chap_group), spdk_json_decode_int32, true}, + {"header_digest", offsetof(struct rpc_target_node, header_digest), spdk_json_decode_bool, true}, + {"data_digest", offsetof(struct rpc_target_node, data_digest), spdk_json_decode_bool, true}, +}; + +static void +spdk_rpc_construct_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_node req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + char *bdev_names[RPC_CONSTRUCT_TARGET_NODE_MAX_LUN] = {0}; + int32_t lun_ids[RPC_CONSTRUCT_TARGET_NODE_MAX_LUN] = {0}; + size_t i; + + if (spdk_json_decode_object(params, rpc_target_node_decoders, + SPDK_COUNTOF(rpc_target_node_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + for (i = 0; i < req.luns.num_luns; i++) { + bdev_names[i] = req.luns.luns[i].bdev_name; + lun_ids[i] = req.luns.luns[i].lun_id; + } + + /* + * Use default parameters in a few places: + * index = -1 : automatically pick an index for the new target node + * alias = NULL + */ + target = spdk_iscsi_tgt_node_construct(-1, req.name, req.alias_name, + pg_tags, + ig_tags, + req.pg_ig_maps.num_maps, + (const char **)bdev_names, + lun_ids, + req.luns.num_luns, + req.queue_depth, + req.disable_chap, + req.require_chap, + req.mutual_chap, + req.chap_group, + req.header_digest, + req.data_digest); + + if (target == NULL) { + goto invalid; + } + + free_rpc_target_node(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_target_node(&req); +} +SPDK_RPC_REGISTER("construct_target_node", spdk_rpc_construct_target_node, SPDK_RPC_RUNTIME) + +struct rpc_tgt_node_pg_ig_maps { + char *name; + struct rpc_pg_ig_maps pg_ig_maps; +}; + +static const struct spdk_json_object_decoder rpc_tgt_node_pg_ig_maps_decoders[] = { + {"name", offsetof(struct rpc_tgt_node_pg_ig_maps, name), spdk_json_decode_string}, + {"pg_ig_maps", offsetof(struct rpc_tgt_node_pg_ig_maps, pg_ig_maps), decode_rpc_pg_ig_maps}, +}; + +static void +spdk_rpc_add_pg_ig_maps(struct 
spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = spdk_iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = spdk_iscsi_tgt_node_add_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("add pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("add_pg_ig_maps", spdk_rpc_add_pg_ig_maps, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_delete_pg_ig_maps(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_tgt_node_pg_ig_maps req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int32_t pg_tags[MAX_TARGET_MAP] = {0}, ig_tags[MAX_TARGET_MAP] = {0}; + size_t i; + int rc; + + if (spdk_json_decode_object(params, rpc_tgt_node_pg_ig_maps_decoders, + SPDK_COUNTOF(rpc_tgt_node_pg_ig_maps_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = spdk_iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + for (i = 0; i < req.pg_ig_maps.num_maps; i++) { + pg_tags[i] = req.pg_ig_maps.maps[i].pg_tag; + ig_tags[i] = req.pg_ig_maps.maps[i].ig_tag; + } + + rc = spdk_iscsi_tgt_node_delete_pg_ig_maps(target, pg_tags, ig_tags, + req.pg_ig_maps.num_maps); + if (rc < 0) { + SPDK_ERRLOG("remove pg-ig maps failed\n"); + goto invalid; + } + + free(req.name); + + w = spdk_jsonrpc_begin_result(request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free(req.name); +} +SPDK_RPC_REGISTER("delete_pg_ig_maps", spdk_rpc_delete_pg_ig_maps, SPDK_RPC_RUNTIME) + +struct rpc_delete_target_node { + char *name; +}; + +static void +free_rpc_delete_target_node(struct rpc_delete_target_node *r) +{ + free(r->name); +} + +static const struct spdk_json_object_decoder rpc_delete_target_node_decoders[] = { + {"name", offsetof(struct rpc_delete_target_node, name), spdk_json_decode_string}, +}; + +static void +spdk_rpc_delete_target_node(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_target_node req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_delete_target_node_decoders, + SPDK_COUNTOF(rpc_delete_target_node_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.name == NULL) { + SPDK_ERRLOG("missing name param\n"); + goto invalid; + } + + if 
(spdk_iscsi_shutdown_tgt_node_by_name(req.name)) { + SPDK_ERRLOG("shutdown_tgt_node_by_name failed\n"); + goto invalid; + } + + free_rpc_delete_target_node(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_delete_target_node(&req); +} +SPDK_RPC_REGISTER("delete_target_node", spdk_rpc_delete_target_node, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_portal_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_portal_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + spdk_iscsi_portal_grps_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_portal_groups", spdk_rpc_get_portal_groups, SPDK_RPC_RUNTIME) + +struct rpc_portal { + char *host; + char *port; + char *cpumask; +}; + +struct rpc_portal_list { + size_t num_portals; + struct rpc_portal portals[MAX_PORTAL]; +}; + +struct rpc_portal_group { + int32_t tag; + struct rpc_portal_list portal_list; +}; + +static void +free_rpc_portal(struct rpc_portal *portal) +{ + free(portal->host); + free(portal->port); + free(portal->cpumask); +} + +static void +free_rpc_portal_list(struct rpc_portal_list *pl) +{ + size_t i; + + for (i = 0; i < pl->num_portals; i++) { + free_rpc_portal(&pl->portals[i]); + } + pl->num_portals = 0; +} + +static void +free_rpc_portal_group(struct rpc_portal_group *pg) +{ + free_rpc_portal_list(&pg->portal_list); +} + +static const struct spdk_json_object_decoder rpc_portal_decoders[] = { + {"host", offsetof(struct rpc_portal, host), spdk_json_decode_string}, + {"port", offsetof(struct rpc_portal, port), spdk_json_decode_string}, + {"cpumask", offsetof(struct rpc_portal, cpumask), spdk_json_decode_string, true}, +}; + +static int +decode_rpc_portal(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal *portal = out; + + return spdk_json_decode_object(val, rpc_portal_decoders, + SPDK_COUNTOF(rpc_portal_decoders), + portal); +} + +static int +decode_rpc_portal_list(const struct spdk_json_val *val, void *out) +{ + struct rpc_portal_list *list = out; + + return spdk_json_decode_array(val, decode_rpc_portal, list->portals, MAX_PORTAL, &list->num_portals, + sizeof(struct rpc_portal)); +} + +static const struct spdk_json_object_decoder rpc_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_portal_group, tag), spdk_json_decode_int32}, + {"portals", offsetof(struct rpc_portal_group, portal_list), decode_rpc_portal_list}, +}; + +static void +spdk_rpc_add_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_portal_group req = {}; + struct spdk_iscsi_portal_grp *pg = NULL; + struct spdk_iscsi_portal *portal; + struct spdk_json_write_ctx *w; + size_t i = 0; + int rc = -1; + + if (spdk_json_decode_object(params, rpc_portal_group_decoders, + SPDK_COUNTOF(rpc_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto out; + } + + pg = spdk_iscsi_portal_grp_create(req.tag); + if (pg == NULL) { + SPDK_ERRLOG("portal_grp_create failed\n"); + goto out; + } + 
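+	/*
+	 * Illustrative request shape for this method (tag/host/port values are
+	 * examples only, not taken from a real configuration):
+	 *   {"jsonrpc": "2.0", "id": 1, "method": "add_portal_group",
+	 *    "params": {"tag": 1, "portals": [{"host": "10.0.0.1", "port": "3260"}]}}
+	 * Each decoded portal is created and attached to the new group below,
+	 * after which the group is opened and registered.
+	 */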
for (i = 0; i < req.portal_list.num_portals; i++) { + portal = spdk_iscsi_portal_create(req.portal_list.portals[i].host, + req.portal_list.portals[i].port, + req.portal_list.portals[i].cpumask); + if (portal == NULL) { + SPDK_ERRLOG("portal_create failed\n"); + goto out; + } + spdk_iscsi_portal_grp_add_portal(pg, portal); + } + + rc = spdk_iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto out; + } + + rc = spdk_iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_register failed\n"); + } + +out: + if (rc == 0) { + w = spdk_jsonrpc_begin_result(request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + } + } else { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + if (pg != NULL) { + spdk_iscsi_portal_grp_release(pg); + } + } + free_rpc_portal_group(&req); +} +SPDK_RPC_REGISTER("add_portal_group", spdk_rpc_add_portal_group, SPDK_RPC_RUNTIME) + +struct rpc_delete_portal_group { + int32_t tag; +}; + +static const struct spdk_json_object_decoder rpc_delete_portal_group_decoders[] = { + {"tag", offsetof(struct rpc_delete_portal_group, tag), spdk_json_decode_int32}, +}; + +static void +spdk_rpc_delete_portal_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_portal_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_portal_grp *pg; + + if (spdk_json_decode_object(params, rpc_delete_portal_group_decoders, + SPDK_COUNTOF(rpc_delete_portal_group_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + pg = spdk_iscsi_portal_grp_unregister(req.tag); + if (!pg) { + goto invalid; + } + + spdk_iscsi_tgt_node_delete_map(pg, NULL); + spdk_iscsi_portal_grp_release(pg); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} +SPDK_RPC_REGISTER("delete_portal_group", spdk_rpc_delete_portal_group, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_iscsi_connections(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_iscsi_conn *conns = g_conns_array; + int i; + uint16_t tsih; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_iscsi_connections requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + for (i = 0; i < MAX_ISCSI_CONNECTIONS; i++) { + struct spdk_iscsi_conn *c = &conns[i]; + + if (!c->is_valid) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "id"); + spdk_json_write_int32(w, c->id); + + spdk_json_write_name(w, "cid"); + spdk_json_write_int32(w, c->cid); + + /* + * If we try to return data for a connection that has not + * logged in yet, the session will not be set. So in this + * case, return -1 for the tsih rather than segfaulting + * on the null c->sess. 
+ */ + if (c->sess == NULL) { + tsih = -1; + } else { + tsih = c->sess->tsih; + } + spdk_json_write_name(w, "tsih"); + spdk_json_write_int32(w, tsih); + + spdk_json_write_name(w, "lcore_id"); + spdk_json_write_int32(w, c->lcore); + + spdk_json_write_name(w, "initiator_addr"); + spdk_json_write_string(w, c->initiator_addr); + + spdk_json_write_name(w, "target_addr"); + spdk_json_write_string(w, c->target_addr); + + spdk_json_write_name(w, "target_node_name"); + spdk_json_write_string(w, c->target_short_name); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_iscsi_connections", spdk_rpc_get_iscsi_connections, SPDK_RPC_RUNTIME) + +struct rpc_target_lun { + char *name; + char *bdev_name; + int32_t lun_id; +}; + +static void +free_rpc_target_lun(struct rpc_target_lun *req) +{ + free(req->name); + free(req->bdev_name); +} + +static const struct spdk_json_object_decoder rpc_target_lun_decoders[] = { + {"name", offsetof(struct rpc_target_lun, name), spdk_json_decode_string}, + {"bdev_name", offsetof(struct rpc_target_lun, bdev_name), spdk_json_decode_string}, + {"lun_id", offsetof(struct rpc_target_lun, lun_id), spdk_json_decode_int32, true}, +}; + +static void +spdk_rpc_target_node_add_lun(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_lun req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + req.lun_id = -1; + + if (spdk_json_decode_object(params, rpc_target_lun_decoders, + SPDK_COUNTOF(rpc_target_lun_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + target = spdk_iscsi_find_tgt_node(req.name); + if (target == NULL) { + SPDK_ERRLOG("target is not found\n"); + goto invalid; + } + + rc = spdk_iscsi_tgt_node_add_lun(target, req.bdev_name, req.lun_id); + if (rc < 0) { + SPDK_ERRLOG("add lun failed\n"); + goto invalid; + } + + free_rpc_target_lun(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_target_lun(&req); +} +SPDK_RPC_REGISTER("target_node_add_lun", spdk_rpc_target_node_add_lun, SPDK_RPC_RUNTIME) + +struct rpc_target_auth { + char *name; + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static void +free_rpc_target_auth(struct rpc_target_auth *req) +{ + free(req->name); +} + +static const struct spdk_json_object_decoder rpc_target_auth_decoders[] = { + {"name", offsetof(struct rpc_target_auth, name), spdk_json_decode_string}, + {"disable_chap", offsetof(struct rpc_target_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_target_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_target_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_target_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +spdk_rpc_set_iscsi_target_node_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_target_auth req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (spdk_json_decode_object(params, rpc_target_auth_decoders, + SPDK_COUNTOF(rpc_target_auth_decoders), &req)) { + 
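+	/* Decode failed: the params object did not match the expected
+	 * {name, disable_chap?, require_chap?, mutual_chap?, chap_group?} shape,
+	 * so reject the request without looking up any target node.
+	 */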
SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + target = spdk_iscsi_find_tgt_node(req.name); + if (target == NULL) { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find target %s", req.name); + free_rpc_target_auth(&req); + return; + } + + rc = spdk_iscsi_tgt_node_set_chap_params(target, req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of auth params"); + free_rpc_target_auth(&req); + return; + } + + free_rpc_target_auth(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_iscsi_target_node_auth", spdk_rpc_set_iscsi_target_node_auth, + SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_iscsi_global_params(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_iscsi_global_params requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_iscsi_opts_info_json(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_iscsi_global_params", spdk_rpc_get_iscsi_global_params, SPDK_RPC_RUNTIME) + +struct rpc_discovery_auth { + bool disable_chap; + bool require_chap; + bool mutual_chap; + int32_t chap_group; +}; + +static const struct spdk_json_object_decoder rpc_discovery_auth_decoders[] = { + {"disable_chap", offsetof(struct rpc_discovery_auth, disable_chap), spdk_json_decode_bool, true}, + {"require_chap", offsetof(struct rpc_discovery_auth, require_chap), spdk_json_decode_bool, true}, + {"mutual_chap", offsetof(struct rpc_discovery_auth, mutual_chap), spdk_json_decode_bool, true}, + {"chap_group", offsetof(struct rpc_discovery_auth, chap_group), spdk_json_decode_int32, true}, +}; + +static void +spdk_rpc_set_iscsi_discovery_auth(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_discovery_auth req = {}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_discovery_auth_decoders, + SPDK_COUNTOF(rpc_discovery_auth_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + rc = spdk_iscsi_set_discovery_auth(req.disable_chap, req.require_chap, + req.mutual_chap, req.chap_group); + if (rc < 0) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid combination of CHAP params"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("set_iscsi_discovery_auth", spdk_rpc_set_iscsi_discovery_auth, SPDK_RPC_RUNTIME) + + +#define MAX_AUTH_SECRETS 64 + +struct rpc_auth_secret { + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_auth_secret(struct rpc_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct 
spdk_json_object_decoder rpc_auth_secret_decoders[] = { + {"user", offsetof(struct rpc_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_auth_secret, msecret), spdk_json_decode_string, true}, +}; + +static int +decode_rpc_auth_secret(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secret *_secret = out; + + return spdk_json_decode_object(val, rpc_auth_secret_decoders, + SPDK_COUNTOF(rpc_auth_secret_decoders), _secret); +} + +struct rpc_auth_secrets { + size_t num_secret; + struct rpc_auth_secret secrets[MAX_AUTH_SECRETS]; +}; + +static void +free_rpc_auth_secrets(struct rpc_auth_secrets *secrets) +{ + size_t i; + + for (i = 0; i < secrets->num_secret; i++) { + free_rpc_auth_secret(&secrets->secrets[i]); + } +} + +static int +decode_rpc_auth_secrets(const struct spdk_json_val *val, void *out) +{ + struct rpc_auth_secrets *secrets = out; + + return spdk_json_decode_array(val, decode_rpc_auth_secret, secrets->secrets, + MAX_AUTH_SECRETS, &secrets->num_secret, + sizeof(struct rpc_auth_secret)); +} + +struct rpc_auth_group { + int32_t tag; + struct rpc_auth_secrets secrets; +}; + +static void +free_rpc_auth_group(struct rpc_auth_group *group) +{ + free_rpc_auth_secrets(&group->secrets); +} + +static const struct spdk_json_object_decoder rpc_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_auth_group, tag), spdk_json_decode_int32}, + {"secrets", offsetof(struct rpc_auth_group, secrets), decode_rpc_auth_secrets, true}, +}; + +static void +spdk_rpc_add_iscsi_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_auth_group req = {}; + struct rpc_auth_secret *_secret; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group = NULL; + int rc; + size_t i; + + if (spdk_json_decode_object(params, rpc_auth_group_decoders, + SPDK_COUNTOF(rpc_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_auth_group(&req); + return; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + rc = spdk_iscsi_add_auth_group(req.tag, &group); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + + for (i = 0; i < req.secrets.num_secret; i++) { + _secret = &req.secrets.secrets[i]; + rc = spdk_iscsi_auth_group_add_secret(group, _secret->user, _secret->secret, + _secret->muser, _secret->msecret); + if (rc != 0) { + spdk_iscsi_delete_auth_group(group); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_auth_group(&req); + return; + } + } + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + free_rpc_auth_group(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("add_iscsi_auth_group", spdk_rpc_add_iscsi_auth_group, SPDK_RPC_RUNTIME) + +struct rpc_delete_auth_group { + int32_t tag; +}; + +static const struct 
spdk_json_object_decoder rpc_delete_auth_group_decoders[] = { + {"tag", offsetof(struct rpc_delete_auth_group, tag), spdk_json_decode_int32}, +}; + +static void +spdk_rpc_delete_iscsi_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_auth_group req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + + if (spdk_json_decode_object(params, rpc_delete_auth_group_decoders, + SPDK_COUNTOF(rpc_delete_auth_group_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + group = spdk_iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + return; + } + + spdk_iscsi_delete_auth_group(group); + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("delete_iscsi_auth_group", spdk_rpc_delete_iscsi_auth_group, SPDK_RPC_RUNTIME) + +struct rpc_add_auth_secret { + int32_t tag; + char *user; + char *secret; + char *muser; + char *msecret; +}; + +static void +free_rpc_add_auth_secret(struct rpc_add_auth_secret *_secret) +{ + free(_secret->user); + free(_secret->secret); + free(_secret->muser); + free(_secret->msecret); +} + +static const struct spdk_json_object_decoder rpc_add_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_add_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_add_auth_secret, user), spdk_json_decode_string}, + {"secret", offsetof(struct rpc_add_auth_secret, secret), spdk_json_decode_string}, + {"muser", offsetof(struct rpc_add_auth_secret, muser), spdk_json_decode_string, true}, + {"msecret", offsetof(struct rpc_add_auth_secret, msecret), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_add_secret_to_iscsi_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_add_auth_secret_decoders, + SPDK_COUNTOF(rpc_add_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + group = spdk_iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_add_auth_secret(&req); + return; + } + + rc = spdk_iscsi_auth_group_add_secret(group, req.user, req.secret, req.muser, req.msecret); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not add secret to auth group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_add_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + free_rpc_add_auth_secret(&req); + + w = 
spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("add_secret_to_iscsi_auth_group", spdk_rpc_add_secret_to_iscsi_auth_group, + SPDK_RPC_RUNTIME) + +struct rpc_delete_auth_secret { + int32_t tag; + char *user; +}; + +static void +free_rpc_delete_auth_secret(struct rpc_delete_auth_secret *_secret) +{ + free(_secret->user); +} + +static const struct spdk_json_object_decoder rpc_delete_auth_secret_decoders[] = { + {"tag", offsetof(struct rpc_delete_auth_secret, tag), spdk_json_decode_int32}, + {"user", offsetof(struct rpc_delete_auth_secret, user), spdk_json_decode_string}, +}; + +static void +spdk_rpc_delete_secret_from_iscsi_auth_group(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_delete_auth_secret req = {}; + struct spdk_json_write_ctx *w; + struct spdk_iscsi_auth_group *group; + int rc; + + if (spdk_json_decode_object(params, rpc_delete_auth_secret_decoders, + SPDK_COUNTOF(rpc_delete_auth_secret_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + free_rpc_delete_auth_secret(&req); + return; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + group = spdk_iscsi_find_auth_group_by_tag(req.tag); + if (group == NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not find auth group (%d)", req.tag); + free_rpc_delete_auth_secret(&req); + return; + } + + rc = spdk_iscsi_auth_group_delete_secret(group, req.user); + if (rc != 0) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Could not delete secret from CHAP group (%d), %s", + req.tag, spdk_strerror(-rc)); + free_rpc_delete_auth_secret(&req); + return; + } + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + free_rpc_delete_auth_secret(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("delete_secret_from_iscsi_auth_group", + spdk_rpc_delete_secret_from_iscsi_auth_group, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_iscsi_auth_groups(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_iscsi_auth_groups requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + spdk_iscsi_auth_groups_info_json(w); + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_iscsi_auth_groups", spdk_rpc_get_iscsi_auth_groups, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/iscsi/iscsi_subsystem.c b/src/spdk/lib/iscsi/iscsi_subsystem.c new file mode 100644 index 00000000..6cfa4f93 --- /dev/null +++ b/src/spdk/lib/iscsi/iscsi_subsystem.c @@ -0,0 +1,1523 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/sock.h" +#include "spdk/likely.h" + +#include "iscsi/iscsi.h" +#include "iscsi/init_grp.h" +#include "iscsi/portal_grp.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" + +#include "spdk_internal/event.h" +#include "spdk_internal/log.h" + +struct spdk_iscsi_opts *g_spdk_iscsi_opts = NULL; + +static spdk_iscsi_init_cb g_init_cb_fn = NULL; +static void *g_init_cb_arg = NULL; + +static spdk_iscsi_fini_cb g_fini_cb_fn; +static void *g_fini_cb_arg; + +#define ISCSI_CONFIG_TMPL \ +"[iSCSI]\n" \ +" # node name (not include optional part)\n" \ +" # Users can optionally change this to fit their environment.\n" \ +" NodeBase \"%s\"\n" \ +"\n" \ +" # files\n" \ +" %s %s\n" \ +"\n" \ +" # socket I/O timeout sec. (polling is infinity)\n" \ +" Timeout %d\n" \ +"\n" \ +" # authentication information for discovery session\n" \ +" DiscoveryAuthMethod %s\n" \ +" DiscoveryAuthGroup %s\n" \ +"\n" \ +" MaxSessions %d\n" \ +" MaxConnectionsPerSession %d\n" \ +" MaxConnections %d\n" \ +" MaxQueueDepth %d\n" \ +"\n" \ +" # iSCSI initial parameters negotiate with initiators\n" \ +" # NOTE: incorrect values might crash\n" \ +" DefaultTime2Wait %d\n" \ +" DefaultTime2Retain %d\n" \ +"\n" \ +" FirstBurstLength %d\n" \ +" ImmediateData %s\n" \ +" ErrorRecoveryLevel %d\n" \ +"\n" + +static void +spdk_iscsi_globals_config_text(FILE *fp) +{ + const char *authmethod = "None"; + char authgroup[32] = "None"; + + if (NULL == fp) { + return; + } + + if (g_spdk_iscsi.require_chap) { + authmethod = "CHAP"; + } else if (g_spdk_iscsi.mutual_chap) { + authmethod = "CHAP Mutual"; + } else if (!g_spdk_iscsi.disable_chap) { + authmethod = "Auto"; + } + + if (g_spdk_iscsi.chap_group) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", g_spdk_iscsi.chap_group); + } + + fprintf(fp, ISCSI_CONFIG_TMPL, + g_spdk_iscsi.nodebase, + g_spdk_iscsi.authfile ? "AuthFile" : "", + g_spdk_iscsi.authfile ? 
g_spdk_iscsi.authfile : "", + g_spdk_iscsi.timeout, authmethod, authgroup, + g_spdk_iscsi.MaxSessions, g_spdk_iscsi.MaxConnectionsPerSession, + g_spdk_iscsi.MaxConnections, + g_spdk_iscsi.MaxQueueDepth, + g_spdk_iscsi.DefaultTime2Wait, g_spdk_iscsi.DefaultTime2Retain, + g_spdk_iscsi.FirstBurstLength, + (g_spdk_iscsi.ImmediateData) ? "Yes" : "No", + g_spdk_iscsi.ErrorRecoveryLevel); +} + +static void +spdk_mobj_ctor(struct spdk_mempool *mp, __attribute__((unused)) void *arg, + void *_m, __attribute__((unused)) unsigned i) +{ + struct spdk_mobj *m = _m; + uint64_t *phys_addr; + ptrdiff_t off; + + m->mp = mp; + m->buf = (uint8_t *)m + sizeof(struct spdk_mobj); + m->buf = (void *)((unsigned long)((uint8_t *)m->buf + 512) & ~511UL); + off = (uint64_t)(uint8_t *)m->buf - (uint64_t)(uint8_t *)m; + + /* + * we store the physical address in a 64bit unsigned integer + * right before the 512B aligned buffer area. + */ + phys_addr = (uint64_t *)m->buf - 1; + *phys_addr = spdk_vtophys(m) + off; +} + +#define NUM_PDU_PER_CONNECTION(iscsi) (2 * (iscsi->MaxQueueDepth + MAX_LARGE_DATAIN_PER_CONNECTION + 8)) +#define PDU_POOL_SIZE(iscsi) (iscsi->MaxConnections * NUM_PDU_PER_CONNECTION(iscsi)) +#define IMMEDIATE_DATA_POOL_SIZE(iscsi) (iscsi->MaxConnections * 128) +#define DATA_OUT_POOL_SIZE(iscsi) (iscsi->MaxConnections * MAX_DATA_OUT_PER_CONNECTION) + +static int spdk_iscsi_initialize_pdu_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_spdk_iscsi; + int imm_mobj_size = spdk_get_immediate_data_buffer_size() + + sizeof(struct spdk_mobj) + 512; + int dout_mobj_size = spdk_get_data_out_buffer_size() + + sizeof(struct spdk_mobj) + 512; + + /* create PDU pool */ + iscsi->pdu_pool = spdk_mempool_create("PDU_Pool", + PDU_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_pdu), + 256, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->pdu_pool) { + SPDK_ERRLOG("create PDU pool failed\n"); + return -1; + } + + iscsi->pdu_immediate_data_pool = spdk_mempool_create_ctor("PDU_immediate_data_Pool", + IMMEDIATE_DATA_POOL_SIZE(iscsi), + imm_mobj_size, 0, + spdk_env_get_socket_id(spdk_env_get_current_core()), + spdk_mobj_ctor, NULL); + if (!iscsi->pdu_immediate_data_pool) { + SPDK_ERRLOG("create PDU immediate data pool failed\n"); + return -1; + } + + iscsi->pdu_data_out_pool = spdk_mempool_create_ctor("PDU_data_out_Pool", + DATA_OUT_POOL_SIZE(iscsi), + dout_mobj_size, 256, + spdk_env_get_socket_id(spdk_env_get_current_core()), + spdk_mobj_ctor, NULL); + if (!iscsi->pdu_data_out_pool) { + SPDK_ERRLOG("create PDU data out pool failed\n"); + return -1; + } + + return 0; +} + +static void spdk_iscsi_sess_ctor(struct spdk_mempool *pool, void *arg, + void *session_buf, unsigned index) +{ + struct spdk_iscsi_globals *iscsi = arg; + struct spdk_iscsi_sess *sess = session_buf; + + iscsi->session[index] = sess; + + /* tsih 0 is reserved, so start tsih values at 1. 
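+	 * (TSIH is the Target Session Identifying Handle that the target assigns
+	 * to a session at login; index 0 would collide with the reserved value.)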
*/ + sess->tsih = index + 1; +} + +#define DEFAULT_TASK_POOL_SIZE 32768 + +static int +spdk_iscsi_initialize_task_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_spdk_iscsi; + + /* create scsi_task pool */ + iscsi->task_pool = spdk_mempool_create("SCSI_TASK_Pool", + DEFAULT_TASK_POOL_SIZE, + sizeof(struct spdk_iscsi_task), + 128, SPDK_ENV_SOCKET_ID_ANY); + if (!iscsi->task_pool) { + SPDK_ERRLOG("create task pool failed\n"); + return -1; + } + + return 0; +} + +#define SESSION_POOL_SIZE(iscsi) (iscsi->MaxSessions) +static int spdk_iscsi_initialize_session_pool(void) +{ + struct spdk_iscsi_globals *iscsi = &g_spdk_iscsi; + + iscsi->session_pool = spdk_mempool_create_ctor("Session_Pool", + SESSION_POOL_SIZE(iscsi), + sizeof(struct spdk_iscsi_sess), 0, + SPDK_ENV_SOCKET_ID_ANY, + spdk_iscsi_sess_ctor, iscsi); + if (!iscsi->session_pool) { + SPDK_ERRLOG("create session pool failed\n"); + return -1; + } + + return 0; +} + +static int +spdk_iscsi_initialize_all_pools(void) +{ + if (spdk_iscsi_initialize_pdu_pool() != 0) { + return -1; + } + + if (spdk_iscsi_initialize_session_pool() != 0) { + return -1; + } + + if (spdk_iscsi_initialize_task_pool() != 0) { + return -1; + } + + return 0; +} + +static void +spdk_iscsi_check_pool(struct spdk_mempool *pool, size_t count) +{ + if (spdk_mempool_count(pool) != count) { + SPDK_ERRLOG("spdk_mempool_count(%s) == %zu, should be %zu\n", + spdk_mempool_get_name(pool), spdk_mempool_count(pool), count); + } +} + +static void +spdk_iscsi_check_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_spdk_iscsi; + + spdk_iscsi_check_pool(iscsi->pdu_pool, PDU_POOL_SIZE(iscsi)); + spdk_iscsi_check_pool(iscsi->session_pool, SESSION_POOL_SIZE(iscsi)); + spdk_iscsi_check_pool(iscsi->pdu_immediate_data_pool, IMMEDIATE_DATA_POOL_SIZE(iscsi)); + spdk_iscsi_check_pool(iscsi->pdu_data_out_pool, DATA_OUT_POOL_SIZE(iscsi)); + spdk_iscsi_check_pool(iscsi->task_pool, DEFAULT_TASK_POOL_SIZE); +} + +static void +spdk_iscsi_free_pools(void) +{ + struct spdk_iscsi_globals *iscsi = &g_spdk_iscsi; + + spdk_mempool_free(iscsi->pdu_pool); + spdk_mempool_free(iscsi->session_pool); + spdk_mempool_free(iscsi->pdu_immediate_data_pool); + spdk_mempool_free(iscsi->pdu_data_out_pool); + spdk_mempool_free(iscsi->task_pool); +} + +void spdk_put_pdu(struct spdk_iscsi_pdu *pdu) +{ + if (!pdu) { + return; + } + + pdu->ref--; + + if (pdu->ref < 0) { + SPDK_ERRLOG("Negative PDU refcount: %p\n", pdu); + pdu->ref = 0; + } + + if (pdu->ref == 0) { + if (pdu->mobj) { + spdk_mempool_put(pdu->mobj->mp, (void *)pdu->mobj); + } + + if (pdu->data && !pdu->data_from_mempool) { + free(pdu->data); + } + + spdk_mempool_put(g_spdk_iscsi.pdu_pool, (void *)pdu); + } +} + +struct spdk_iscsi_pdu *spdk_get_pdu(void) +{ + struct spdk_iscsi_pdu *pdu; + + pdu = spdk_mempool_get(g_spdk_iscsi.pdu_pool); + if (!pdu) { + SPDK_ERRLOG("Unable to get PDU\n"); + abort(); + } + + /* we do not want to zero out the last part of the structure reserved for AHS and sense data */ + memset(pdu, 0, offsetof(struct spdk_iscsi_pdu, ahs)); + pdu->ref = 1; + + return pdu; +} + +static void +spdk_iscsi_log_globals(void) +{ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthFile %s\n", + g_spdk_iscsi.authfile ? 
g_spdk_iscsi.authfile : "(none)"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NodeBase %s\n", g_spdk_iscsi.nodebase); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxSessions %d\n", g_spdk_iscsi.MaxSessions); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxConnectionsPerSession %d\n", + g_spdk_iscsi.MaxConnectionsPerSession); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MaxQueueDepth %d\n", g_spdk_iscsi.MaxQueueDepth); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Wait %d\n", + g_spdk_iscsi.DefaultTime2Wait); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "DefaultTime2Retain %d\n", + g_spdk_iscsi.DefaultTime2Retain); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "FirstBurstLength %d\n", + g_spdk_iscsi.FirstBurstLength); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ImmediateData %s\n", + g_spdk_iscsi.ImmediateData ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AllowDuplicateIsid %s\n", + g_spdk_iscsi.AllowDuplicateIsid ? "Yes" : "No"); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "ErrorRecoveryLevel %d\n", + g_spdk_iscsi.ErrorRecoveryLevel); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Timeout %d\n", g_spdk_iscsi.timeout); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "NopInInterval %d\n", + g_spdk_iscsi.nopininterval); + if (g_spdk_iscsi.disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod None\n"); + } else if (!g_spdk_iscsi.require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthMethod %s %s\n", + g_spdk_iscsi.require_chap ? "CHAP" : "", + g_spdk_iscsi.mutual_chap ? "Mutual" : ""); + } + + if (g_spdk_iscsi.chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "DiscoveryAuthGroup AuthGroup%d\n", + g_spdk_iscsi.chap_group); + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "MinConnectionsPerCore%d\n", + spdk_iscsi_conn_get_min_per_core()); +} + +static void +spdk_iscsi_opts_init(struct spdk_iscsi_opts *opts) +{ + opts->MaxSessions = DEFAULT_MAX_SESSIONS; + opts->MaxConnectionsPerSession = DEFAULT_MAX_CONNECTIONS_PER_SESSION; + opts->MaxQueueDepth = DEFAULT_MAX_QUEUE_DEPTH; + opts->DefaultTime2Wait = DEFAULT_DEFAULTTIME2WAIT; + opts->DefaultTime2Retain = DEFAULT_DEFAULTTIME2RETAIN; + opts->FirstBurstLength = DEFAULT_FIRSTBURSTLENGTH; + opts->ImmediateData = DEFAULT_IMMEDIATEDATA; + opts->AllowDuplicateIsid = false; + opts->ErrorRecoveryLevel = DEFAULT_ERRORRECOVERYLEVEL; + opts->timeout = DEFAULT_TIMEOUT; + opts->nopininterval = DEFAULT_NOPININTERVAL; + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + opts->chap_group = 0; + opts->authfile = NULL; + opts->nodebase = NULL; + opts->min_connections_per_core = DEFAULT_CONNECTIONS_PER_LCORE; +} + +struct spdk_iscsi_opts * +spdk_iscsi_opts_alloc(void) +{ + struct spdk_iscsi_opts *opts; + + opts = calloc(1, sizeof(*opts)); + if (!opts) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + spdk_iscsi_opts_init(opts); + + return opts; +} + +void +spdk_iscsi_opts_free(struct spdk_iscsi_opts *opts) +{ + free(opts->authfile); + free(opts->nodebase); + free(opts); +} + +/* Deep copy of spdk_iscsi_opts */ +struct spdk_iscsi_opts * +spdk_iscsi_opts_copy(struct spdk_iscsi_opts *src) +{ + struct spdk_iscsi_opts *dst; + + dst = calloc(1, sizeof(*dst)); + if (!dst) { + SPDK_ERRLOG("calloc() failed for iscsi options\n"); + return NULL; + } + + if (src->authfile) { + dst->authfile = strdup(src->authfile); + if (!dst->authfile) { + free(dst); + SPDK_ERRLOG("failed to strdup for auth file %s\n", src->authfile); + return NULL; + } + } + + if (src->nodebase) { + 
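+		/* Deep-copy the nodebase string as well; if the copy fails, release the
+		 * authfile duplicate made above before giving up.
+		 */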
dst->nodebase = strdup(src->nodebase); + if (!dst->nodebase) { + free(dst->authfile); + free(dst); + SPDK_ERRLOG("failed to strdup for nodebase %s\n", src->nodebase); + return NULL; + } + } + + dst->MaxSessions = src->MaxSessions; + dst->MaxConnectionsPerSession = src->MaxConnectionsPerSession; + dst->MaxQueueDepth = src->MaxQueueDepth; + dst->DefaultTime2Wait = src->DefaultTime2Wait; + dst->DefaultTime2Retain = src->DefaultTime2Retain; + dst->FirstBurstLength = src->FirstBurstLength; + dst->ImmediateData = src->ImmediateData; + dst->AllowDuplicateIsid = src->AllowDuplicateIsid; + dst->ErrorRecoveryLevel = src->ErrorRecoveryLevel; + dst->timeout = src->timeout; + dst->nopininterval = src->nopininterval; + dst->disable_chap = src->disable_chap; + dst->require_chap = src->require_chap; + dst->mutual_chap = src->mutual_chap; + dst->chap_group = src->chap_group; + dst->min_connections_per_core = src->min_connections_per_core; + + return dst; +} + +static int +spdk_iscsi_read_config_file_params(struct spdk_conf_section *sp, + struct spdk_iscsi_opts *opts) +{ + const char *val; + int MaxSessions; + int MaxConnectionsPerSession; + int MaxQueueDepth; + int DefaultTime2Wait; + int DefaultTime2Retain; + int FirstBurstLength; + int ErrorRecoveryLevel; + int timeout; + int nopininterval; + int min_conn_per_core = 0; + const char *ag_tag; + int ag_tag_i; + int i; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + val = spdk_conf_section_get_val(sp, "AuthFile"); + if (val != NULL) { + opts->authfile = strdup(val); + if (!opts->authfile) { + SPDK_ERRLOG("strdup() failed for AuthFile\n"); + return -ENOMEM; + } + } + + val = spdk_conf_section_get_val(sp, "NodeBase"); + if (val != NULL) { + opts->nodebase = strdup(val); + if (!opts->nodebase) { + free(opts->authfile); + SPDK_ERRLOG("strdup() failed for NodeBase\n"); + return -ENOMEM; + } + } + + MaxSessions = spdk_conf_section_get_intval(sp, "MaxSessions"); + if (MaxSessions >= 0) { + opts->MaxSessions = MaxSessions; + } + + MaxConnectionsPerSession = spdk_conf_section_get_intval(sp, "MaxConnectionsPerSession"); + if (MaxConnectionsPerSession >= 0) { + opts->MaxConnectionsPerSession = MaxConnectionsPerSession; + } + + MaxQueueDepth = spdk_conf_section_get_intval(sp, "MaxQueueDepth"); + if (MaxQueueDepth >= 0) { + opts->MaxQueueDepth = MaxQueueDepth; + } + + DefaultTime2Wait = spdk_conf_section_get_intval(sp, "DefaultTime2Wait"); + if (DefaultTime2Wait >= 0) { + opts->DefaultTime2Wait = DefaultTime2Wait; + } + + DefaultTime2Retain = spdk_conf_section_get_intval(sp, "DefaultTime2Retain"); + if (DefaultTime2Retain >= 0) { + opts->DefaultTime2Retain = DefaultTime2Retain; + } + + FirstBurstLength = spdk_conf_section_get_intval(sp, "FirstBurstLength"); + if (FirstBurstLength >= 0) { + opts->FirstBurstLength = FirstBurstLength; + } + + opts->ImmediateData = spdk_conf_section_get_boolval(sp, "ImmediateData", + opts->ImmediateData); + + /* This option is only for test. + * If AllowDuplicateIsid is enabled, it allows different connections carrying + * TSIH=0 login the target within the same session. 
+ */ + opts->AllowDuplicateIsid = spdk_conf_section_get_boolval(sp, "AllowDuplicateIsid", + opts->AllowDuplicateIsid); + + ErrorRecoveryLevel = spdk_conf_section_get_intval(sp, "ErrorRecoveryLevel"); + if (ErrorRecoveryLevel >= 0) { + opts->ErrorRecoveryLevel = ErrorRecoveryLevel; + } + timeout = spdk_conf_section_get_intval(sp, "Timeout"); + if (timeout >= 0) { + opts->timeout = timeout; + } + nopininterval = spdk_conf_section_get_intval(sp, "NopInInterval"); + if (nopininterval >= 0) { + opts->nopininterval = nopininterval; + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthMethod"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "DiscoveryAuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + opts->require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + opts->require_chap = true; + opts->mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + opts->disable_chap = false; + opts->require_chap = false; + opts->mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + opts->disable_chap = true; + opts->require_chap = false; + opts->mutual_chap = false; + } else { + SPDK_ERRLOG("unknown CHAP mode %s\n", val); + } + } + if (opts->mutual_chap && !opts->require_chap) { + SPDK_ERRLOG("CHAP must set to be required when using mutual CHAP.\n"); + return -EINVAL; + } + } + val = spdk_conf_section_get_val(sp, "DiscoveryAuthGroup"); + if (val != NULL) { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + opts->chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &ag_tag_i) != 1 + || ag_tag_i == 0) { + SPDK_ERRLOG("invalid auth group %s, ignoring\n", ag_tag); + } else { + opts->chap_group = ag_tag_i; + } + } + } + min_conn_per_core = spdk_conf_section_get_intval(sp, "MinConnectionsPerCore"); + if (min_conn_per_core >= 0) { + opts->min_connections_per_core = min_conn_per_core; + } + + return 0; +} + +static int +spdk_iscsi_opts_verify(struct spdk_iscsi_opts *opts) +{ + if (!opts->nodebase) { + opts->nodebase = strdup(SPDK_ISCSI_DEFAULT_NODEBASE); + if (opts->nodebase == NULL) { + SPDK_ERRLOG("strdup() failed for default nodebase\n"); + return -ENOMEM; + } + } + + if (opts->MaxSessions == 0 || opts->MaxSessions > 65535) { + SPDK_ERRLOG("%d is invalid. MaxSessions must be more than 0 and no more than 65535\n", + opts->MaxSessions); + return -EINVAL; + } + + if (opts->MaxConnectionsPerSession == 0 || opts->MaxConnectionsPerSession > 65535) { + SPDK_ERRLOG("%d is invalid. MaxConnectionsPerSession must be more than 0 and no more than 65535\n", + opts->MaxConnectionsPerSession); + return -EINVAL; + } + + if (opts->MaxQueueDepth == 0 || opts->MaxQueueDepth > 256) { + SPDK_ERRLOG("%d is invalid. MaxQueueDepth must be more than 0 and no more than 256\n", + opts->MaxQueueDepth); + return -EINVAL; + } + + if (opts->DefaultTime2Wait > 3600) { + SPDK_ERRLOG("%d is invalid. DefaultTime2Wait must be no more than 3600\n", + opts->DefaultTime2Wait); + return -EINVAL; + } + + if (opts->DefaultTime2Retain > 3600) { + SPDK_ERRLOG("%d is invalid. 
DefaultTime2Retain must be no more than 3600\n", + opts->DefaultTime2Retain); + return -EINVAL; + } + + if (opts->FirstBurstLength >= SPDK_ISCSI_MIN_FIRST_BURST_LENGTH) { + if (opts->FirstBurstLength > SPDK_ISCSI_MAX_BURST_LENGTH) { + SPDK_ERRLOG("FirstBurstLength %d shall not exceed MaxBurstLength %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MAX_BURST_LENGTH); + return -EINVAL; + } + } else { + SPDK_ERRLOG("FirstBurstLength %d shall be no less than %d\n", + opts->FirstBurstLength, SPDK_ISCSI_MIN_FIRST_BURST_LENGTH); + return -EINVAL; + } + + if (opts->ErrorRecoveryLevel > 2) { + SPDK_ERRLOG("ErrorRecoveryLevel %d is not supported.\n", opts->ErrorRecoveryLevel); + return -EINVAL; + } + + if (opts->timeout < 0) { + SPDK_ERRLOG("%d is invalid. timeout must not be less than 0\n", opts->timeout); + return -EINVAL; + } + + if (opts->nopininterval < 0 || opts->nopininterval > MAX_NOPININTERVAL) { + SPDK_ERRLOG("%d is invalid. nopinterval must be between 0 and %d\n", + opts->nopininterval, MAX_NOPININTERVAL); + return -EINVAL; + } + + if (!spdk_iscsi_check_chap_params(opts->disable_chap, opts->require_chap, + opts->mutual_chap, opts->chap_group)) { + SPDK_ERRLOG("CHAP params in opts are illegal combination\n"); + return -EINVAL; + } + + return 0; +} + +static int +spdk_iscsi_parse_options(struct spdk_iscsi_opts **popts) +{ + struct spdk_iscsi_opts *opts; + struct spdk_conf_section *sp; + int rc; + + opts = spdk_iscsi_opts_alloc(); + if (!opts) { + SPDK_ERRLOG("spdk_iscsi_opts_alloc_failed() failed\n"); + return -ENOMEM; + } + + /* Process parameters */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_read_config_file_parmas\n"); + sp = spdk_conf_find_section(NULL, "iSCSI"); + if (sp != NULL) { + rc = spdk_iscsi_read_config_file_params(sp, opts); + if (rc != 0) { + free(opts); + SPDK_ERRLOG("spdk_iscsi_read_config_file_params() failed\n"); + return rc; + } + } + + *popts = opts; + + return 0; +} + +static int +spdk_iscsi_set_global_params(struct spdk_iscsi_opts *opts) +{ + int rc; + + rc = spdk_iscsi_opts_verify(opts); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_opts_verify() failed\n"); + return rc; + } + + if (opts->authfile != NULL) { + g_spdk_iscsi.authfile = strdup(opts->authfile); + if (!g_spdk_iscsi.authfile) { + SPDK_ERRLOG("failed to strdup for auth file %s\n", opts->authfile); + return -ENOMEM; + } + } + + g_spdk_iscsi.nodebase = strdup(opts->nodebase); + if (!g_spdk_iscsi.nodebase) { + SPDK_ERRLOG("failed to strdup for nodebase %s\n", opts->nodebase); + return -ENOMEM; + } + + g_spdk_iscsi.MaxSessions = opts->MaxSessions; + g_spdk_iscsi.MaxConnectionsPerSession = opts->MaxConnectionsPerSession; + g_spdk_iscsi.MaxQueueDepth = opts->MaxQueueDepth; + g_spdk_iscsi.DefaultTime2Wait = opts->DefaultTime2Wait; + g_spdk_iscsi.DefaultTime2Retain = opts->DefaultTime2Retain; + g_spdk_iscsi.FirstBurstLength = opts->FirstBurstLength; + g_spdk_iscsi.ImmediateData = opts->ImmediateData; + g_spdk_iscsi.AllowDuplicateIsid = opts->AllowDuplicateIsid; + g_spdk_iscsi.ErrorRecoveryLevel = opts->ErrorRecoveryLevel; + g_spdk_iscsi.timeout = opts->timeout; + g_spdk_iscsi.nopininterval = opts->nopininterval; + g_spdk_iscsi.disable_chap = opts->disable_chap; + g_spdk_iscsi.require_chap = opts->require_chap; + g_spdk_iscsi.mutual_chap = opts->mutual_chap; + g_spdk_iscsi.chap_group = opts->chap_group; + + spdk_iscsi_conn_set_min_per_core(opts->min_connections_per_core); + + spdk_iscsi_log_globals(); + + return 0; +} + +int +spdk_iscsi_set_discovery_auth(bool disable_chap, bool require_chap, bool mutual_chap, + int32_t 
chap_group) +{ + if (!spdk_iscsi_check_chap_params(disable_chap, require_chap, mutual_chap, + chap_group)) { + SPDK_ERRLOG("CHAP params are illegal combination\n"); + return -EINVAL; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + g_spdk_iscsi.disable_chap = disable_chap; + g_spdk_iscsi.require_chap = require_chap; + g_spdk_iscsi.mutual_chap = mutual_chap; + g_spdk_iscsi.chap_group = chap_group; + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return 0; +} + +int +spdk_iscsi_auth_group_add_secret(struct spdk_iscsi_auth_group *group, + const char *user, const char *secret, + const char *muser, const char *msecret) +{ + struct spdk_iscsi_auth_secret *_secret; + size_t len; + + if (user == NULL || secret == NULL) { + SPDK_ERRLOG("user and secret must be specified\n"); + return -EINVAL; + } + + if (muser != NULL && msecret == NULL) { + SPDK_ERRLOG("msecret must be specified with muser\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + SPDK_ERRLOG("user for secret is duplicated\n"); + return -EEXIST; + } + } + + _secret = calloc(1, sizeof(*_secret)); + if (_secret == NULL) { + SPDK_ERRLOG("calloc() failed for CHAP secret\n"); + return -ENOMEM; + } + + len = strnlen(user, sizeof(_secret->user)); + if (len > sizeof(_secret->user) - 1) { + SPDK_ERRLOG("CHAP user longer than %zu characters: %s\n", + sizeof(_secret->user) - 1, user); + free(_secret); + return -EINVAL; + } + memcpy(_secret->user, user, len); + + len = strnlen(secret, sizeof(_secret->secret)); + if (len > sizeof(_secret->secret) - 1) { + SPDK_ERRLOG("CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->secret) - 1, secret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->secret, secret, len); + + if (muser != NULL) { + len = strnlen(muser, sizeof(_secret->muser)); + if (len > sizeof(_secret->muser) - 1) { + SPDK_ERRLOG("Mutual CHAP user longer than %zu characters: %s\n", + sizeof(_secret->muser) - 1, muser); + free(_secret); + return -EINVAL; + } + memcpy(_secret->muser, muser, len); + + len = strnlen(msecret, sizeof(_secret->msecret)); + if (len > sizeof(_secret->msecret) - 1) { + SPDK_ERRLOG("Mutual CHAP secret longer than %zu characters: %s\n", + sizeof(_secret->msecret) - 1, msecret); + free(_secret); + return -EINVAL; + } + memcpy(_secret->msecret, msecret, len); + } + + TAILQ_INSERT_TAIL(&group->secret_head, _secret, tailq); + return 0; +} + +int +spdk_iscsi_auth_group_delete_secret(struct spdk_iscsi_auth_group *group, + const char *user) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (user == NULL) { + SPDK_ERRLOG("user must be specified\n"); + return -EINVAL; + } + + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, user) == 0) { + break; + } + } + + if (_secret == NULL) { + SPDK_ERRLOG("secret is not found\n"); + return -ENODEV; + } + + TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + + return 0; +} + +int +spdk_iscsi_add_auth_group(int32_t tag, struct spdk_iscsi_auth_group **_group) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_spdk_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + SPDK_ERRLOG("Auth group (%d) already exists\n", tag); + return -EEXIST; + } + } + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("calloc() failed for auth group\n"); + return -ENOMEM; + } + + TAILQ_INIT(&group->secret_head); + group->tag = tag; + + TAILQ_INSERT_TAIL(&g_spdk_iscsi.auth_group_head, group, tailq); + + *_group = group; 
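+	/* The new group is now linked on g_spdk_iscsi.auth_group_head. The RPC
+	 * handlers take g_spdk_iscsi.mutex around this call; the config-file
+	 * parser invokes it during subsystem initialization.
+	 */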
+ return 0; +} + +void +spdk_iscsi_delete_auth_group(struct spdk_iscsi_auth_group *group) +{ + struct spdk_iscsi_auth_secret *_secret, *tmp; + + TAILQ_REMOVE(&g_spdk_iscsi.auth_group_head, group, tailq); + + TAILQ_FOREACH_SAFE(_secret, &group->secret_head, tailq, tmp) { + TAILQ_REMOVE(&group->secret_head, _secret, tailq); + free(_secret); + } + free(group); +} + +struct spdk_iscsi_auth_group * +spdk_iscsi_find_auth_group_by_tag(int32_t tag) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_spdk_iscsi.auth_group_head, tailq) { + if (group->tag == tag) { + return group; + } + } + + return NULL; +} + +static void +spdk_iscsi_auth_groups_destroy(void) +{ + struct spdk_iscsi_auth_group *group, *tmp; + + TAILQ_FOREACH_SAFE(group, &g_spdk_iscsi.auth_group_head, tailq, tmp) { + spdk_iscsi_delete_auth_group(group); + } +} + +static int +spdk_iscsi_parse_auth_group(struct spdk_conf_section *sp) +{ + int rc; + int i; + int tag; + const char *val, *user, *secret, *muser, *msecret; + struct spdk_iscsi_auth_group *group = NULL; + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + tag = spdk_conf_section_get_num(sp); + + rc = spdk_iscsi_add_auth_group(tag, &group); + if (rc != 0) { + SPDK_ERRLOG("Failed to add auth group\n"); + return rc; + } + + for (i = 0; ; i++) { + val = spdk_conf_section_get_nval(sp, "Auth", i); + if (val == NULL) { + break; + } + + user = spdk_conf_section_get_nmval(sp, "Auth", i, 0); + secret = spdk_conf_section_get_nmval(sp, "Auth", i, 1); + muser = spdk_conf_section_get_nmval(sp, "Auth", i, 2); + msecret = spdk_conf_section_get_nmval(sp, "Auth", i, 3); + + rc = spdk_iscsi_auth_group_add_secret(group, user, secret, muser, msecret); + if (rc != 0) { + SPDK_ERRLOG("Failed to add secret to auth group\n"); + spdk_iscsi_delete_auth_group(group); + return rc; + } + } + + return 0; +} + +static int +spdk_iscsi_parse_auth_info(void) +{ + struct spdk_conf *config; + struct spdk_conf_section *sp; + int rc; + + config = spdk_conf_allocate(); + if (!config) { + SPDK_ERRLOG("Failed to allocate config file\n"); + return -ENOMEM; + } + + rc = spdk_conf_read(config, g_spdk_iscsi.authfile); + if (rc != 0) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "Failed to load auth file\n"); + spdk_conf_free(config); + return rc; + } + + sp = spdk_conf_first_section(config); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "AuthGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + SPDK_ERRLOG("Group 0 is invalid\n"); + spdk_iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return -EINVAL; + } + + rc = spdk_iscsi_parse_auth_group(sp); + if (rc != 0) { + SPDK_ERRLOG("parse_auth_group() failed\n"); + spdk_iscsi_auth_groups_destroy(); + spdk_conf_free(config); + return rc; + } + } + sp = spdk_conf_next_section(sp); + } + + spdk_conf_free(config); + return 0; +} + +static struct spdk_iscsi_auth_secret * +spdk_iscsi_find_auth_secret(const char *authuser, int ag_tag) +{ + struct spdk_iscsi_auth_group *group; + struct spdk_iscsi_auth_secret *_secret; + + TAILQ_FOREACH(group, &g_spdk_iscsi.auth_group_head, tailq) { + if (group->tag == ag_tag) { + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + if (strcmp(_secret->user, authuser) == 0) { + return _secret; + } + } + } + } + + return NULL; +} + +int +spdk_iscsi_chap_get_authinfo(struct iscsi_chap_auth *auth, const char *authuser, + int ag_tag) +{ + struct spdk_iscsi_auth_secret *_secret; + + if (authuser == NULL) { + return -EINVAL; + } + + if 
(auth->user[0] != '\0') { + memset(auth->user, 0, sizeof(auth->user)); + memset(auth->secret, 0, sizeof(auth->secret)); + memset(auth->muser, 0, sizeof(auth->muser)); + memset(auth->msecret, 0, sizeof(auth->msecret)); + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + _secret = spdk_iscsi_find_auth_secret(authuser, ag_tag); + if (_secret == NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + SPDK_ERRLOG("CHAP secret is not found: user:%s, tag:%d\n", + authuser, ag_tag); + return -ENOENT; + } + + memcpy(auth->user, _secret->user, sizeof(auth->user)); + memcpy(auth->secret, _secret->secret, sizeof(auth->secret)); + + if (_secret->muser[0] != '\0') { + memcpy(auth->muser, _secret->muser, sizeof(auth->muser)); + memcpy(auth->msecret, _secret->msecret, sizeof(auth->msecret)); + } + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return 0; +} + +static int +spdk_iscsi_initialize_global_params(void) +{ + int rc; + + if (!g_spdk_iscsi_opts) { + rc = spdk_iscsi_parse_options(&g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_parse_options() failed\n"); + return rc; + } + } + + rc = spdk_iscsi_set_global_params(g_spdk_iscsi_opts); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_set_global_params() failed\n"); + } + + spdk_iscsi_opts_free(g_spdk_iscsi_opts); + g_spdk_iscsi_opts = NULL; + + return rc; +} + +static void +spdk_iscsi_init_complete(int rc) +{ + spdk_iscsi_init_cb cb_fn = g_init_cb_fn; + void *cb_arg = g_init_cb_arg; + + g_init_cb_fn = NULL; + g_init_cb_arg = NULL; + + cb_fn(cb_arg, rc); +} + +static int +spdk_iscsi_poll_group_poll(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + int rc; + + if (spdk_unlikely(STAILQ_EMPTY(&group->connections))) { + return 0; + } + + rc = spdk_sock_group_poll(group->sock_group); + if (rc < 0) { + SPDK_ERRLOG("Failed to poll sock_group=%p\n", group->sock_group); + } + + STAILQ_FOREACH_SAFE(conn, &group->connections, link, tmp) { + if (conn->state == ISCSI_CONN_STATE_EXITING) { + spdk_iscsi_conn_destruct(conn); + } + } + + return -1; +} + +static int +spdk_iscsi_poll_group_handle_nop(void *ctx) +{ + struct spdk_iscsi_poll_group *group = ctx; + struct spdk_iscsi_conn *conn, *tmp; + + STAILQ_FOREACH_SAFE(conn, &group->connections, link, tmp) { + spdk_iscsi_conn_handle_nop(conn); + } + + return -1; +} + +static void +iscsi_create_poll_group(void *ctx) +{ + struct spdk_iscsi_poll_group *pg; + + assert(g_spdk_iscsi.poll_group != NULL); + pg = &g_spdk_iscsi.poll_group[spdk_env_get_current_core()]; + pg->core = spdk_env_get_current_core(); + + STAILQ_INIT(&pg->connections); + pg->sock_group = spdk_sock_group_create(); + assert(pg->sock_group != NULL); + + pg->poller = spdk_poller_register(spdk_iscsi_poll_group_poll, pg, 0); + /* set the period to 1 sec */ + pg->nop_poller = spdk_poller_register(spdk_iscsi_poll_group_handle_nop, pg, 1000000); +} + +static void +iscsi_unregister_poll_group(void *ctx) +{ + struct spdk_iscsi_poll_group *pg; + + assert(g_spdk_iscsi.poll_group != NULL); + pg = &g_spdk_iscsi.poll_group[spdk_env_get_current_core()]; + assert(pg->poller != NULL); + assert(pg->sock_group != NULL); + + spdk_sock_group_close(&pg->sock_group); + spdk_poller_unregister(&pg->poller); + spdk_poller_unregister(&pg->nop_poller); +} + +static void +spdk_initialize_iscsi_poll_group(spdk_thread_fn cpl) +{ + size_t g_num_poll_groups = spdk_env_get_last_core() + 1; + + g_spdk_iscsi.poll_group = calloc(g_num_poll_groups, sizeof(struct spdk_iscsi_poll_group)); + if (!g_spdk_iscsi.poll_group) { + 
SPDK_ERRLOG("Failed to allocated iscsi poll group\n"); + spdk_iscsi_init_complete(-1); + return; + } + + /* Send a message to each thread and create a poll group */ + spdk_for_each_thread(iscsi_create_poll_group, NULL, cpl); +} + +static void +spdk_iscsi_parse_configuration(void *ctx) +{ + int rc; + + rc = spdk_iscsi_parse_portal_grps(); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_portal_grps() failed\n"); + goto end; + } + + rc = spdk_iscsi_parse_init_grps(); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_init_grps() failed\n"); + goto end; + } + + rc = spdk_iscsi_parse_tgt_nodes(); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_tgt_nodes() failed\n"); + } + + if (g_spdk_iscsi.authfile != NULL) { + if (access(g_spdk_iscsi.authfile, R_OK) == 0) { + rc = spdk_iscsi_parse_auth_info(); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_auth_info() failed\n"); + } + } else { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "CHAP secret file is not found in the path %s\n", + g_spdk_iscsi.authfile); + } + } + +end: + spdk_iscsi_init_complete(rc); +} + +static int +spdk_iscsi_parse_globals(void) +{ + int rc; + + rc = spdk_iscsi_initialize_global_params(); + if (rc != 0) { + SPDK_ERRLOG("spdk_iscsi_initialize_iscsi_global_params() failed\n"); + return rc; + } + + g_spdk_iscsi.session = spdk_dma_zmalloc(sizeof(void *) * g_spdk_iscsi.MaxSessions, 0, NULL); + if (!g_spdk_iscsi.session) { + SPDK_ERRLOG("spdk_dma_zmalloc() failed for session array\n"); + return -1; + } + + /* + * For now, just support same number of total connections, rather + * than MaxSessions * MaxConnectionsPerSession. After we add better + * handling for low resource conditions from our various buffer + * pools, we can bump this up to support more connections. + */ + g_spdk_iscsi.MaxConnections = g_spdk_iscsi.MaxSessions; + + rc = spdk_iscsi_initialize_all_pools(); + if (rc != 0) { + SPDK_ERRLOG("spdk_initialize_all_pools() failed\n"); + return -1; + } + + rc = spdk_initialize_iscsi_conns(); + if (rc < 0) { + SPDK_ERRLOG("spdk_initialize_iscsi_conns() failed\n"); + return rc; + } + + spdk_initialize_iscsi_poll_group(spdk_iscsi_parse_configuration); + return 0; +} + +void +spdk_iscsi_init(spdk_iscsi_init_cb cb_fn, void *cb_arg) +{ + int rc; + + assert(cb_fn != NULL); + g_init_cb_fn = cb_fn; + g_init_cb_arg = cb_arg; + + rc = spdk_iscsi_parse_globals(); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_globals() failed\n"); + spdk_iscsi_init_complete(-1); + } + + /* + * spdk_iscsi_parse_configuration() will be called as the callback to + * spdk_initialize_iscsi_poll_group() and will complete iSCSI + * subsystem initialization. 
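For orientation, a minimal sketch of a hypothetical caller: spdk_iscsi_init() is asynchronous, so the supplied callback fires only after the per-core poll groups exist and the configuration has been parsed.

static void
example_iscsi_init_done(void *cb_arg, int rc)
{
	if (rc != 0) {
		SPDK_ERRLOG("iSCSI subsystem initialization failed\n");
		return;
	}
	/* Poll groups and configuration are ready at this point. */
}

static void
example_start_iscsi_subsystem(void)
{
	spdk_iscsi_init(example_iscsi_init_done, NULL);
}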
+ */ +} + +void +spdk_iscsi_fini(spdk_iscsi_fini_cb cb_fn, void *cb_arg) +{ + g_fini_cb_fn = cb_fn; + g_fini_cb_arg = cb_arg; + + spdk_iscsi_portal_grp_close_all(); + spdk_shutdown_iscsi_conns(); +} + +static void +spdk_iscsi_fini_done(void *arg) +{ + spdk_iscsi_check_pools(); + spdk_iscsi_free_pools(); + + spdk_iscsi_shutdown_tgt_nodes(); + spdk_iscsi_init_grps_destroy(); + spdk_iscsi_portal_grps_destroy(); + spdk_iscsi_auth_groups_destroy(); + free(g_spdk_iscsi.authfile); + free(g_spdk_iscsi.nodebase); + free(g_spdk_iscsi.poll_group); + + pthread_mutex_destroy(&g_spdk_iscsi.mutex); + g_fini_cb_fn(g_fini_cb_arg); +} + +void +spdk_shutdown_iscsi_conns_done(void) +{ + if (g_spdk_iscsi.poll_group) { + spdk_for_each_thread(iscsi_unregister_poll_group, NULL, spdk_iscsi_fini_done); + } else { + spdk_iscsi_fini_done(NULL); + } +} + +void +spdk_iscsi_config_text(FILE *fp) +{ + spdk_iscsi_globals_config_text(fp); + spdk_iscsi_portal_grps_config_text(fp); + spdk_iscsi_init_grps_config_text(fp); + spdk_iscsi_tgt_nodes_config_text(fp); +} + +void +spdk_iscsi_opts_info_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + if (g_spdk_iscsi.authfile != NULL) { + spdk_json_write_named_string(w, "auth_file", g_spdk_iscsi.authfile); + } + spdk_json_write_named_string(w, "node_base", g_spdk_iscsi.nodebase); + + spdk_json_write_named_uint32(w, "max_sessions", g_spdk_iscsi.MaxSessions); + spdk_json_write_named_uint32(w, "max_connections_per_session", + g_spdk_iscsi.MaxConnectionsPerSession); + + spdk_json_write_named_uint32(w, "max_queue_depth", g_spdk_iscsi.MaxQueueDepth); + + spdk_json_write_named_uint32(w, "default_time2wait", g_spdk_iscsi.DefaultTime2Wait); + spdk_json_write_named_uint32(w, "default_time2retain", g_spdk_iscsi.DefaultTime2Retain); + + spdk_json_write_named_uint32(w, "first_burst_length", g_spdk_iscsi.FirstBurstLength); + + spdk_json_write_named_bool(w, "immediate_data", g_spdk_iscsi.ImmediateData); + + spdk_json_write_named_bool(w, "allow_duplicated_isid", g_spdk_iscsi.AllowDuplicateIsid); + + spdk_json_write_named_uint32(w, "error_recovery_level", g_spdk_iscsi.ErrorRecoveryLevel); + + spdk_json_write_named_int32(w, "nop_timeout", g_spdk_iscsi.timeout); + spdk_json_write_named_int32(w, "nop_in_interval", g_spdk_iscsi.nopininterval); + + spdk_json_write_named_bool(w, "disable_chap", g_spdk_iscsi.disable_chap); + spdk_json_write_named_bool(w, "require_chap", g_spdk_iscsi.require_chap); + spdk_json_write_named_bool(w, "mutual_chap", g_spdk_iscsi.mutual_chap); + spdk_json_write_named_int32(w, "chap_group", g_spdk_iscsi.chap_group); + + spdk_json_write_named_uint32(w, "min_connections_per_core", + spdk_iscsi_conn_get_min_per_core()); + + spdk_json_write_object_end(w); +} + +static void +spdk_iscsi_auth_group_info_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_secret *_secret; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", group->tag); + + spdk_json_write_named_array_begin(w, "secrets"); + TAILQ_FOREACH(_secret, &group->secret_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "user", _secret->user); + spdk_json_write_named_string(w, "secret", _secret->secret); + + if (_secret->muser[0] != '\0') { + spdk_json_write_named_string(w, "muser", _secret->muser); + spdk_json_write_named_string(w, "msecret", _secret->msecret); + } + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void 
+spdk_iscsi_auth_group_config_json(struct spdk_iscsi_auth_group *group, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "add_iscsi_auth_group"); + + spdk_json_write_name(w, "params"); + spdk_iscsi_auth_group_info_json(group, w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_auth_groups_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_spdk_iscsi.auth_group_head, tailq) { + spdk_iscsi_auth_group_info_json(group, w); + } +} + +static void +spdk_iscsi_auth_groups_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_auth_group *group; + + TAILQ_FOREACH(group, &g_spdk_iscsi.auth_group_head, tailq) { + spdk_iscsi_auth_group_config_json(group, w); + } +} + +static void +spdk_iscsi_opts_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "set_iscsi_options"); + + spdk_json_write_name(w, "params"); + spdk_iscsi_opts_info_json(w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_config_json(struct spdk_json_write_ctx *w) +{ + spdk_json_write_array_begin(w); + spdk_iscsi_opts_config_json(w); + spdk_iscsi_portal_grps_config_json(w); + spdk_iscsi_init_grps_config_json(w); + spdk_iscsi_tgt_nodes_config_json(w); + spdk_iscsi_auth_groups_config_json(w); + spdk_json_write_array_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("iscsi", SPDK_LOG_ISCSI) diff --git a/src/spdk/lib/iscsi/md5.c b/src/spdk/lib/iscsi/md5.c new file mode 100644 index 00000000..2b3291e4 --- /dev/null +++ b/src/spdk/lib/iscsi/md5.c @@ -0,0 +1,75 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "spdk/stdinc.h"
+
+#include <openssl/md5.h>
+
+#include "iscsi/md5.h"
+
+int spdk_md5init(struct spdk_md5ctx *md5ctx)
+{
+	int rc;
+
+	if (md5ctx == NULL) {
+		return -1;
+	}
+	rc = MD5_Init(&md5ctx->md5ctx);
+	return rc;
+}
+
+int spdk_md5final(void *md5, struct spdk_md5ctx *md5ctx)
+{
+	int rc;
+
+	if (md5ctx == NULL || md5 == NULL) {
+		return -1;
+	}
+	rc = MD5_Final(md5, &md5ctx->md5ctx);
+	return rc;
+}
+
+int spdk_md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len)
+{
+	int rc;
+
+	if (md5ctx == NULL) {
+		return -1;
+	}
+	if (data == NULL || len == 0) {
+		return 0;
+	}
+	rc = MD5_Update(&md5ctx->md5ctx, data, len);
+	return rc;
+}
diff --git a/src/spdk/lib/iscsi/md5.h b/src/spdk/lib/iscsi/md5.h
new file mode 100644
index 00000000..ff571b4a
--- /dev/null
+++ b/src/spdk/lib/iscsi/md5.h
@@ -0,0 +1,52 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama .
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef SPDK_MD5_H
+#define SPDK_MD5_H
+
+#include "spdk/stdinc.h"
+
+#include <openssl/md5.h>
+
+#define SPDK_MD5DIGEST_LEN MD5_DIGEST_LENGTH
+
+struct spdk_md5ctx {
+	MD5_CTX md5ctx;
+};
+
+int spdk_md5init(struct spdk_md5ctx *md5ctx);
+int spdk_md5final(void *md5, struct spdk_md5ctx *md5ctx);
+int spdk_md5update(struct spdk_md5ctx *md5ctx, const void *data, size_t len);
+
+#endif /* SPDK_MD5_H */
diff --git a/src/spdk/lib/iscsi/param.c b/src/spdk/lib/iscsi/param.c
new file mode 100644
index 00000000..e09bf899
--- /dev/null
+++ b/src/spdk/lib/iscsi/param.c
@@ -0,0 +1,1182 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (C) 2008-2012 Daisuke Aoyama .
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
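As an illustration of the MD5 wrapper above (an editor's sketch, not code from the patch): a CHAP response is the MD5 digest of the one-byte identifier, the shared secret, and the binary challenge, which maps directly onto these helpers.

static int
example_chap_response(uint8_t id, const char *secret,
		      const uint8_t *challenge, size_t challenge_len,
		      uint8_t response[SPDK_MD5DIGEST_LEN])
{
	struct spdk_md5ctx ctx;

	/* OpenSSL's MD5_Init() returns 1 on success, so spdk_md5init() does too. */
	if (spdk_md5init(&ctx) != 1) {
		return -1;
	}
	spdk_md5update(&ctx, &id, 1);
	spdk_md5update(&ctx, secret, strlen(secret));
	spdk_md5update(&ctx, challenge, challenge_len);
	spdk_md5final(response, &ctx);

	return 0;
}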
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "iscsi/iscsi.h" +#include "iscsi/param.h" +#include "iscsi/conn.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#define MAX_TMPBUF 1024 + +/* whose value may be bigger than 255 */ +static const char *non_simple_value_params[] = { + "CHAP_C", + "CHAP_R", + NULL, +}; + +void +spdk_iscsi_param_free(struct iscsi_param *params) +{ + struct iscsi_param *param, *next_param; + + if (params == NULL) { + return; + } + for (param = params; param != NULL; param = next_param) { + next_param = param->next; + if (param->list) { + free(param->list); + } + free(param->val); + free(param->key); + free(param); + } +} + +static int +spdk_iscsi_find_key_in_array(const char *key, const char *array[]) +{ + int i; + + for (i = 0; array[i] != NULL; i++) { + if (strcasecmp(key, array[i]) == 0) { + return 1; + } + } + return 0; +} + +struct iscsi_param * +spdk_iscsi_param_find(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + if (params == NULL || key == NULL) { + return NULL; + } + for (param = params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + return param; + } + } + return NULL; +} + +int +spdk_iscsi_param_del(struct iscsi_param **params, const char *key) +{ + struct iscsi_param *param, *prev_param = NULL; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "del %s\n", key); + if (params == NULL || key == NULL) { + return 0; + } + for (param = *params; param != NULL; param = param->next) { + if (param->key != NULL && param->key[0] == key[0] + && strcasecmp(param->key, key) == 0) { + if (prev_param != NULL) { + prev_param->next = param->next; + } else { + *params = param->next; + } + param->next = NULL; + spdk_iscsi_param_free(param); + return 0; + } + prev_param = param; + } + return -1; +} + +int +spdk_iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type) +{ + struct iscsi_param *param, *last_param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add %s=%s, list=[%s], type=%d\n", + key, val, list, type); + if (key == NULL) { + return -1; + } + + param = spdk_iscsi_param_find(*params, key); + if (param != NULL) { + spdk_iscsi_param_del(params, key); + } + + param = calloc(1, 
sizeof(*param)); + if (!param) { + SPDK_ERRLOG("calloc() failed for parameter\n"); + return -ENOMEM; + } + + param->next = NULL; + param->key = xstrdup(key); + param->val = xstrdup(val); + param->list = xstrdup(list); + param->type = type; + + last_param = *params; + if (last_param != NULL) { + while (last_param->next != NULL) { + last_param = last_param->next; + } + last_param->next = param; + } else { + *params = param; + } + + return 0; +} + +int +spdk_iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%s\n", key, val); + param = spdk_iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + + param->val = xstrdup(val); + + return 0; +} + +int +spdk_iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val) +{ + char buf[MAX_TMPBUF]; + struct iscsi_param *param; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set %s=%d\n", key, val); + param = spdk_iscsi_param_find(params, key); + if (param == NULL) { + SPDK_ERRLOG("no key %s\n", key); + return -1; + } + + free(param->val); + snprintf(buf, sizeof buf, "%d", val); + + param->val = strdup(buf); + + return 0; +} + +/** + * Parse a single KEY=VAL pair + * + * data = "KEY=VAL" + */ +static int +spdk_iscsi_parse_param(struct iscsi_param **params, const uint8_t *data) +{ + int rc; + uint8_t *key_copy; + const uint8_t *key_end, *val; + int key_len, val_len; + int max_len; + + key_end = strchr(data, '='); + if (!key_end) { + SPDK_ERRLOG("'=' not found\n"); + return -1; + } + + key_len = key_end - data; + if (key_len == 0) { + SPDK_ERRLOG("Empty key\n"); + return -1; + } + /* + * RFC 7143 6.1 + */ + if (key_len > ISCSI_TEXT_MAX_KEY_LEN) { + SPDK_ERRLOG("Key name length is bigger than 63\n"); + return -1; + } + + key_copy = malloc(key_len + 1); + if (!key_copy) { + SPDK_ERRLOG("malloc() failed for key_copy\n"); + return -ENOMEM; + } + + memcpy(key_copy, data, key_len); + key_copy[key_len] = '\0'; + /* check whether this key is duplicated */ + if (NULL != spdk_iscsi_param_find(*params, key_copy)) { + SPDK_ERRLOG("Duplicated Key %s\n", key_copy); + free(key_copy); + return -1; + } + + val = key_end + 1; /* +1 to skip over the '=' */ + val_len = strlen(val); + /* + * RFC 3720 5.1 + * If not otherwise specified, the maximum length of a simple-value + * (not its encoded representation) is 255 bytes, not including the delimiter + * (comma or zero byte). + */ + /* + * comma or zero is counted in, otherwise we need to iterate each parameter + * value + */ + max_len = spdk_iscsi_find_key_in_array(key_copy, non_simple_value_params) ? + ISCSI_TEXT_MAX_VAL_LEN : ISCSI_TEXT_MAX_SIMPLE_VAL_LEN; + if (val_len > max_len) { + SPDK_ERRLOG("Overflow Val %d\n", val_len); + free(key_copy); + return -1; + } + + rc = spdk_iscsi_param_add(params, key_copy, val, NULL, 0); + free(key_copy); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + + /* return number of bytes consumed + * +1 for '=' and +1 for NUL + */ + return key_len + 1 + val_len + 1; +} + +/** + * Parse a sequence of KEY=VAL pairs. + * + * \param data "KEY=VALKEY=VAL..." 
+ * \param len length of data in bytes + */ +int +spdk_iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter) +{ + int rc, offset = 0; + char *p; + int i; + + /* strip the partial text parameters if previous PDU have C enabled */ + if (partial_parameter && *partial_parameter) { + for (i = 0; i < len && data[i] != '\0'; i++) { + ; + } + p = spdk_sprintf_alloc("%s%s", *partial_parameter, (const char *)data); + if (!p) { + return -1; + } + rc = spdk_iscsi_parse_param(params, p); + free(p); + if (rc < 0) { + return -1; + } + free(*partial_parameter); + *partial_parameter = NULL; + + data = data + i + 1; + len = len - (i + 1); + } + + /* strip the partial text parameters if C bit is enabled */ + if (cbit_enabled) { + if (partial_parameter == NULL) { + SPDK_ERRLOG("C bit set but no partial parameters provided\n"); + return -1; + } + + /* + * reverse iterate the string from the tail not including '\0' + * index of last '\0' is len -1. + */ + for (i = len - 2; data[i] != '\0' && i > 0; i--) { + ; + } + *partial_parameter = xstrdup(&data[i == 0 ? 0 : i + 1]); + len = (i == 0 ? 0 : i + 1); + } + + while (offset < len && data[offset] != '\0') { + rc = spdk_iscsi_parse_param(params, data + offset); + if (rc < 0) { + return -1; + } + offset += rc; + } + return 0; +} + +char * +spdk_iscsi_param_get_val(struct iscsi_param *params, const char *key) +{ + struct iscsi_param *param; + + param = spdk_iscsi_param_find(params, key); + if (param == NULL) { + return NULL; + } + return param->val; +} + +int +spdk_iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val) +{ + struct iscsi_param *param; + + param = spdk_iscsi_param_find(params, key); + if (param == NULL) { + return 0; + } + if (strcasecmp(param->val, val) == 0) { + return 1; + } + return 0; +} + +struct iscsi_param_table { + const char *key; + const char *val; + const char *list; + int type; +}; + +static const struct iscsi_param_table conn_param_table[] = { + { "HeaderDigest", "None", "CRC32C,None", ISPT_LIST }, + { "DataDigest", "None", "CRC32C,None", ISPT_LIST }, + { "MaxRecvDataSegmentLength", "8192", "512,16777215", ISPT_NUMERICAL_DECLARATIVE }, + { "OFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "IFMarker", "No", "Yes,No", ISPT_BOOLEAN_AND }, + { "OFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "IFMarkInt", "1", "1,65535", ISPT_NUMERICAL_MIN }, + { "AuthMethod", "None", "CHAP,None", ISPT_LIST }, + { "CHAP_A", "5", "5", ISPT_LIST }, + { "CHAP_N", "", "", ISPT_DECLARATIVE }, + { "CHAP_R", "", "", ISPT_DECLARATIVE }, + { "CHAP_I", "", "", ISPT_DECLARATIVE }, + { "CHAP_C", "", "", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static const struct iscsi_param_table sess_param_table[] = { + { "MaxConnections", "1", "1,65535", ISPT_NUMERICAL_MIN }, +#if 0 + /* need special handling */ + { "SendTargets", "", "", ISPT_DECLARATIVE }, +#endif + { "TargetName", "", "", ISPT_DECLARATIVE }, + { "InitiatorName", "", "", ISPT_DECLARATIVE }, + { "TargetAlias", "", "", ISPT_DECLARATIVE }, + { "InitiatorAlias", "", "", ISPT_DECLARATIVE }, + { "TargetAddress", "", "", ISPT_DECLARATIVE }, + { "TargetPortalGroupTag", "1", "1,65535", ISPT_NUMERICAL_DECLARATIVE }, + { "InitialR2T", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ImmediateData", "Yes", "Yes,No", ISPT_BOOLEAN_AND }, + { "MaxBurstLength", "262144", "512,16777215", ISPT_NUMERICAL_MIN }, + { "FirstBurstLength", "65536", "512,16777215", ISPT_NUMERICAL_MIN }, + { "DefaultTime2Wait", "2", 
"0,3600", ISPT_NUMERICAL_MAX }, + { "DefaultTime2Retain", "20", "0,3600", ISPT_NUMERICAL_MIN }, + { "MaxOutstandingR2T", "1", "1,65536", ISPT_NUMERICAL_MIN }, + { "DataPDUInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "DataSequenceInOrder", "Yes", "Yes,No", ISPT_BOOLEAN_OR }, + { "ErrorRecoveryLevel", "0", "0,2", ISPT_NUMERICAL_MIN }, + { "SessionType", "Normal", "Normal,Discovery", ISPT_DECLARATIVE }, + { NULL, NULL, NULL, ISPT_INVALID }, +}; + +static int +spdk_iscsi_params_init_internal(struct iscsi_param **params, + const struct iscsi_param_table *table) +{ + int rc; + int i; + struct iscsi_param *param; + + for (i = 0; table[i].key != NULL; i++) { + rc = spdk_iscsi_param_add(params, table[i].key, table[i].val, + table[i].list, table[i].type); + if (rc < 0) { + SPDK_ERRLOG("iscsi_param_add() failed\n"); + return -1; + } + param = spdk_iscsi_param_find(*params, table[i].key); + if (param != NULL) { + param->state_index = i; + } else { + SPDK_ERRLOG("spdk_iscsi_param_find() failed\n"); + return -1; + } + } + + return 0; +} + +int +spdk_iscsi_conn_params_init(struct iscsi_param **params) +{ + return spdk_iscsi_params_init_internal(params, &conn_param_table[0]); +} + +int +spdk_iscsi_sess_params_init(struct iscsi_param **params) +{ + return spdk_iscsi_params_init_internal(params, &sess_param_table[0]); +} + +static const char *chap_type[] = { + "CHAP_A", + "CHAP_N", + "CHAP_R", + "CHAP_I", + "CHAP_C", + NULL, +}; + +static const char *discovery_ignored_param[] = { + "MaxConnections", + "InitialR2T", + "ImmediateData", + "MaxBurstLength", + "FirstBurstLength" + "MaxOutstandingR2T", + "DataPDUInOrder", + NULL, +}; + +static const char *multi_negot_conn_params[] = { + "MaxRecvDataSegmentLength", + NULL, +}; + +/* The following params should be declared by target */ +static const char *target_declarative_params[] = { + "TargetAlias", + "TargetAddress", + "TargetPortalGroupTag", + NULL, +}; + +/* This function is used to construct the data from the special param (e.g., + * MaxRecvDataSegmentLength) + * return: + * normal: the total len of the data + * error: -1 + */ +static int +spdk_iscsi_special_param_construction(struct spdk_iscsi_conn *conn, + struct iscsi_param *param, + bool FirstBurstLength_flag, char *data, + int alloc_len, int total) +{ + int len; + struct iscsi_param *param_first; + struct iscsi_param *param_max; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + char *val; + + val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!val) { + SPDK_ERRLOG("malloc() failed for temporary buffer\n"); + return -ENOMEM; + } + + if (strcasecmp(param->key, "MaxRecvDataSegmentLength") == 0) { + /* + * MaxRecvDataSegmentLength is sent by both + * initiator and target, but is declarative - meaning + * each direction can have different values. + * So when MaxRecvDataSegmentLength is found in the + * the parameter set sent from the initiator, add SPDK + * iscsi target's MaxRecvDataSegmentLength value to + * the returned parameter list. 
+ */ + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "returning MaxRecvDataSegmentLength=%d\n", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + len = snprintf((char *)data + total, alloc_len - total, + "MaxRecvDataSegmentLength=%d", + SPDK_ISCSI_MAX_RECV_DATA_SEGMENT_LENGTH); + total += len + 1; + } + + if (strcasecmp(param->key, "MaxBurstLength") == 0 && + !FirstBurstLength_flag) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + free(val); + return -1; + } + + param_first = spdk_iscsi_param_find(conn->sess->params, + "FirstBurstLength"); + if (param_first != NULL) { + FirstBurstLength = (uint32_t)strtol(param_first->val, NULL, 10); + } else { + FirstBurstLength = SPDK_ISCSI_FIRST_BURST_LENGTH; + } + param_max = spdk_iscsi_param_find(conn->sess->params, + "MaxBurstLength"); + if (param_max != NULL) { + MaxBurstLength = (uint32_t)strtol(param_max->val, NULL, 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + + if (FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + if (param_first != NULL) { + free(param_first->val); + snprintf(val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + param_first->val = xstrdup(val); + } + } + len = snprintf((char *)data + total, alloc_len - total, + "FirstBurstLength=%d", FirstBurstLength); + total += len + 1; + } + + free(val); + return total; + +} + +/** + * spdk_iscsi_construct_data_from_param: + * To construct the data which will be returned to the initiator + * return: length of the negotiated data, -1 indicates error; + */ +static int +spdk_iscsi_construct_data_from_param(struct iscsi_param *param, char *new_val, + char *data, int alloc_len, int total) +{ + int len; + + if (param->type != ISPT_DECLARATIVE && + param->type != ISPT_NUMERICAL_DECLARATIVE) { + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "negotiated %s=%s\n", + param->key, new_val); + len = snprintf((char *)data + total, alloc_len - total, "%s=%s", + param->key, new_val); + total += len + 1; + } + return total; +} + +/** + * To negotiate param with + * type = ISPT_LIST + * return: the negotiated value of the key + */ +static char *spdk_iscsi_negotiate_param_list(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *val_start, *val_end; + char *in_start, *in_end; + int flag = 0; + + if (add_param_value == NULL) { + return NULL; + } + + in_start = in_val; + do { + if ((in_end = strchr(in_start, (int)',')) != NULL) { + *in_end = '\0'; + } + val_start = valid_list; + do { + if ((val_end = strchr(val_start, (int)',')) != NULL) { + *val_end = '\0'; + } + if (strcasecmp(in_start, val_start) == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "match %s\n", + val_start); + flag = 1; + break; + } + if (val_end) { + *val_end = ','; + val_start = val_end + 1; + } + } while (val_end); + if (flag) { + break; + } + if (in_end) { + *in_end = ','; + in_start = in_end + 1; + } + } while (in_end); + + return flag ? 
val_start : NULL; +} + +/** + * To negotiate param with + * type = ISPT_NUMERICAL_MIN/MAX, ISPT_NUMERICAL_DECLARATIVE + * return: the negotiated value of the key + */ +static char *spdk_iscsi_negotiate_param_numerical(int *add_param_value, + struct iscsi_param *param, + char *valid_list, char *in_val, + char *cur_val) +{ + char *valid_next; + char *new_val = NULL; + char *min_val, *max_val; + int val_i, cur_val_i; + int min_i, max_i; + + if (add_param_value == NULL) { + return NULL; + } + + val_i = (int)strtol(param->val, NULL, 10); + /* check whether the key is FirstBurstLength, if that we use in_val */ + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + val_i = (int)strtol(in_val, NULL, 10); + } + + cur_val_i = (int)strtol(cur_val, NULL, 10); + valid_next = valid_list; + min_val = spdk_strsepq(&valid_next, ","); + max_val = spdk_strsepq(&valid_next, ","); + min_i = (min_val != NULL) ? (int)strtol(min_val, NULL, 10) : 0; + max_i = (max_val != NULL) ? (int)strtol(max_val, NULL, 10) : 0; + if (val_i < min_i || val_i > max_i) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "key %.64s reject\n", param->key); + new_val = NULL; + } else { + switch (param->type) { + case ISPT_NUMERICAL_MIN: + if (val_i > cur_val_i) { + val_i = cur_val_i; + } + break; + case ISPT_NUMERICAL_MAX: + if (val_i < cur_val_i) { + val_i = cur_val_i; + } + break; + default: + break; + } + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", val_i); + new_val = in_val; + } + + return new_val; +} + +/** + * To negotiate param with + * type = ISPT_BOOLEAN_OR, ISPT_BOOLEAN_AND + * return: the negotiated value of the key + */ +static char *spdk_iscsi_negotiate_param_boolean(int *add_param_value, + struct iscsi_param *param, + char *in_val, char *cur_val, + const char *value) +{ + char *new_val = NULL; + + if (add_param_value == NULL) { + return NULL; + } + + /* Make sure the val is Yes or No */ + if (!((strcasecmp(in_val, "Yes") == 0) || + (strcasecmp(in_val, "No") == 0))) { + /* unknown value */ + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Reject"); + new_val = in_val; + *add_param_value = 1; + return new_val; + } + + if (strcasecmp(cur_val, value) == 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", value); + new_val = in_val; + } else { + new_val = param->val; + } + + return new_val; +} + +/** + * The entry function to handle each type of the param + * return value: the new negotiated value + */ +static char * +spdk_iscsi_negotiate_param_all(int *add_param_value, struct iscsi_param *param, + char *valid_list, char *in_val, char *cur_val) +{ + char *new_val; + switch (param->type) { + case ISPT_LIST: + new_val = spdk_iscsi_negotiate_param_list(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_NUMERICAL_MIN: + case ISPT_NUMERICAL_MAX: + case ISPT_NUMERICAL_DECLARATIVE: + new_val = spdk_iscsi_negotiate_param_numerical(add_param_value, + param, + valid_list, + in_val, + cur_val); + break; + + case ISPT_BOOLEAN_OR: + new_val = spdk_iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "Yes"); + break; + case ISPT_BOOLEAN_AND: + new_val = spdk_iscsi_negotiate_param_boolean(add_param_value, + param, + in_val, + cur_val, + "No"); + break; + + default: + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + new_val = in_val; + break; + } + + return new_val; +} + +/** + * This function is used to judge whether the param is in session's params or + * connection's params + */ +static int +spdk_iscsi_negotiate_param_init(struct spdk_iscsi_conn *conn, + struct 
iscsi_param **cur_param_p, + struct iscsi_param **params_dst_p, + struct iscsi_param *param) +{ + int index; + + *cur_param_p = spdk_iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + *params_dst_p = conn->sess->params; + *cur_param_p = spdk_iscsi_param_find(*params_dst_p, param->key); + if (*cur_param_p == NULL) { + if ((strncasecmp(param->key, "X-", 2) == 0) || + (strncasecmp(param->key, "X#", 2) == 0)) { + /* Extension Key */ + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "extension key %.64s\n", + param->key); + } else { + SPDK_ERRLOG("unknown key %.64s\n", param->key); + } + return 1; + } else { + index = (*cur_param_p)->state_index; + if (conn->sess_param_state_negotiated[index] && + !spdk_iscsi_find_key_in_array(param->key, + target_declarative_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->sess_param_state_negotiated[index] = true; + } + } else { + index = (*cur_param_p)->state_index; + if (conn->conn_param_state_negotiated[index] && + !spdk_iscsi_find_key_in_array(param->key, + multi_negot_conn_params)) { + return SPDK_ISCSI_PARAMETER_EXCHANGE_NOT_ONCE; + } + conn->conn_param_state_negotiated[index] = true; + } + + return 0; +} + +int +spdk_iscsi_negotiate_params(struct spdk_iscsi_conn *conn, + struct iscsi_param **params, uint8_t *data, int alloc_len, + int data_len) +{ + struct iscsi_param *param; + struct iscsi_param *cur_param; + char *valid_list, *in_val; + char *cur_val; + char *new_val; + int discovery; + int total; + int rc; + uint32_t FirstBurstLength; + uint32_t MaxBurstLength; + bool FirstBurstLength_flag = false; + int type; + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (*params == NULL) { + /* no input */ + return total; + } + + /* discovery? 
*/ + discovery = 0; + cur_param = spdk_iscsi_param_find(*params, "SessionType"); + if (cur_param == NULL) { + cur_param = spdk_iscsi_param_find(conn->sess->params, "SessionType"); + if (cur_param == NULL) { + /* no session type */ + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + } else { + if (strcasecmp(cur_param->val, "Discovery") == 0) { + discovery = 1; + } + } + + /* for temporary store */ + valid_list = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!valid_list) { + SPDK_ERRLOG("malloc() failed for valid_list\n"); + return -ENOMEM; + } + + in_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!in_val) { + SPDK_ERRLOG("malloc() failed for in_val\n"); + free(valid_list); + return -ENOMEM; + } + + cur_val = malloc(ISCSI_TEXT_MAX_VAL_LEN + 1); + if (!cur_val) { + SPDK_ERRLOG("malloc() failed for cur_val\n"); + free(valid_list); + free(in_val); + return -ENOMEM; + } + + /* To adjust the location of FirstBurstLength location and put it to + * the end, then we can always firstly determine the MaxBurstLength + */ + param = spdk_iscsi_param_find(*params, "MaxBurstLength"); + if (param != NULL) { + param = spdk_iscsi_param_find(*params, "FirstBurstLength"); + + /* check the existence of FirstBurstLength */ + if (param != NULL) { + FirstBurstLength_flag = true; + if (param->next != NULL) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + type = param->type; + spdk_iscsi_param_add(params, "FirstBurstLength", + in_val, NULL, type); + } + } + } + + for (param = *params; param != NULL; param = param->next) { + struct iscsi_param *params_dst = conn->params; + int add_param_value = 0; + new_val = NULL; + param->type = ISPT_INVALID; + + /* sendtargets is special */ + if (strcasecmp(param->key, "SendTargets") == 0) { + continue; + } + /* CHAP keys */ + if (spdk_iscsi_find_key_in_array(param->key, chap_type)) { + continue; + } + + /* 12.2, 12.10, 12.11, 12.13, 12.14, 12.17, 12.18, 12.19 */ + if (discovery && + spdk_iscsi_find_key_in_array(param->key, + discovery_ignored_param)) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "Irrelevant"); + new_val = in_val; + add_param_value = 1; + } else { + rc = spdk_iscsi_negotiate_param_init(conn, + &cur_param, + ¶ms_dst, + param); + if (rc < 0) { + free(valid_list); + free(in_val); + free(cur_val); + return rc; + } else if (rc > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", "NotUnderstood"); + new_val = in_val; + add_param_value = 1; + } else { + snprintf(valid_list, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->list); + snprintf(cur_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", cur_param->val); + param->type = cur_param->type; + } + } + + if (param->type > 0) { + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN + 1, "%s", param->val); + + /* "NotUnderstood" value shouldn't be assigned to "Understood" key */ + if (strcasecmp(in_val, "NotUnderstood") == 0) { + free(in_val); + free(valid_list); + free(cur_val); + return SPDK_ISCSI_LOGIN_ERROR_PARAMETER; + } + + if (strcasecmp(param->key, "FirstBurstLength") == 0) { + FirstBurstLength = (uint32_t)strtol(param->val, NULL, + 10); + new_val = spdk_iscsi_param_get_val(conn->sess->params, + "MaxBurstLength"); + if (new_val != NULL) { + MaxBurstLength = (uint32_t) strtol(new_val, NULL, + 10); + } else { + MaxBurstLength = SPDK_ISCSI_MAX_BURST_LENGTH; + } + if (FirstBurstLength < MAX_FIRSTBURSTLENGTH && + FirstBurstLength > MaxBurstLength) { + FirstBurstLength = MaxBurstLength; + snprintf(in_val, ISCSI_TEXT_MAX_VAL_LEN, "%d", + FirstBurstLength); + } + } + + /* 
prevent target's declarative params from being changed by initiator */ + if (spdk_iscsi_find_key_in_array(param->key, target_declarative_params)) { + add_param_value = 1; + } + + new_val = spdk_iscsi_negotiate_param_all(&add_param_value, + param, + valid_list, + in_val, + cur_val); + } + + /* check the negotiated value of the key */ + if (new_val != NULL) { + /* add_param_value = 0 means updating the value of + * existed key in the connection's parameters + */ + if (add_param_value == 0) { + spdk_iscsi_param_set(params_dst, param->key, new_val); + } + total = spdk_iscsi_construct_data_from_param(param, + new_val, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + + total = spdk_iscsi_special_param_construction(conn, + param, + FirstBurstLength_flag, + data, + alloc_len, + total); + if (total < 0) { + goto final_return; + } + } else { + total = -1; + break; + } + } + +final_return: + free(valid_list); + free(in_val); + free(cur_val); + + return total; +} + +int +spdk_iscsi_copy_param2var(struct spdk_iscsi_conn *conn) +{ + const char *val; + + val = spdk_iscsi_param_get_val(conn->params, "MaxRecvDataSegmentLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxRecvDataSegmentLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "copy MaxRecvDataSegmentLength=%s\n", val); + conn->MaxRecvDataSegmentLength = (int)strtol(val, NULL, 10); + if (conn->MaxRecvDataSegmentLength > SPDK_ISCSI_MAX_SEND_DATA_SEGMENT_LENGTH) { + conn->MaxRecvDataSegmentLength = SPDK_ISCSI_MAX_SEND_DATA_SEGMENT_LENGTH; + } + + val = spdk_iscsi_param_get_val(conn->params, "HeaderDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval HeaderDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=1\n"); + conn->header_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set HeaderDigest=0\n"); + conn->header_digest = 0; + } + val = spdk_iscsi_param_get_val(conn->params, "DataDigest"); + if (val == NULL) { + SPDK_ERRLOG("Getval DataDigest failed\n"); + return -1; + } + if (strcasecmp(val, "CRC32C") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=1\n"); + conn->data_digest = 1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set DataDigest=0\n"); + conn->data_digest = 0; + } + + val = spdk_iscsi_param_get_val(conn->sess->params, "MaxConnections"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxConnections failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxConnections=%s\n", val); + conn->sess->MaxConnections = (uint32_t) strtol(val, NULL, 10); + val = spdk_iscsi_param_get_val(conn->sess->params, "MaxOutstandingR2T"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxOutstandingR2T failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxOutstandingR2T=%s\n", val); + conn->sess->MaxOutstandingR2T = (uint32_t) strtol(val, NULL, 10); + val = spdk_iscsi_param_get_val(conn->sess->params, "FirstBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval FirstBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy FirstBurstLength=%s\n", val); + conn->sess->FirstBurstLength = (uint32_t) strtol(val, NULL, 10); + val = spdk_iscsi_param_get_val(conn->sess->params, "MaxBurstLength"); + if (val == NULL) { + SPDK_ERRLOG("Getval MaxBurstLength failed\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "copy MaxBurstLength=%s\n", val); + conn->sess->MaxBurstLength = (uint32_t) strtol(val, NULL, 10); + val = spdk_iscsi_param_get_val(conn->sess->params, "InitialR2T"); + if (val == 
NULL) { + SPDK_ERRLOG("Getval InitialR2T failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=1\n"); + conn->sess->InitialR2T = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set InitialR2T=0\n"); + conn->sess->InitialR2T = false; + } + val = spdk_iscsi_param_get_val(conn->sess->params, "ImmediateData"); + if (val == NULL) { + SPDK_ERRLOG("Getval ImmediateData failed\n"); + return -1; + } + if (strcasecmp(val, "Yes") == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=1\n"); + conn->sess->ImmediateData = true; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "set ImmediateData=0\n"); + conn->sess->ImmediateData = false; + } + return 0; +} diff --git a/src/spdk/lib/iscsi/param.h b/src/spdk/lib/iscsi/param.h new file mode 100644 index 00000000..c9dc8cab --- /dev/null +++ b/src/spdk/lib/iscsi/param.h @@ -0,0 +1,84 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_ISCSI_PARAM_H +#define SPDK_ISCSI_PARAM_H + +#include "spdk/stdinc.h" + +enum iscsi_param_type { + ISPT_INVALID = -1, + ISPT_NOTSPECIFIED = 0, + ISPT_LIST, + ISPT_NUMERICAL_MIN, + ISPT_NUMERICAL_MAX, + ISPT_NUMERICAL_DECLARATIVE, + ISPT_DECLARATIVE, + ISPT_BOOLEAN_OR, + ISPT_BOOLEAN_AND, +}; + +struct iscsi_param { + struct iscsi_param *next; + char *key; + char *val; + char *list; + int type; + int state_index; +}; + +void +spdk_iscsi_param_free(struct iscsi_param *params); +struct iscsi_param * +spdk_iscsi_param_find(struct iscsi_param *params, const char *key); +int +spdk_iscsi_param_del(struct iscsi_param **params, const char *key); +int +spdk_iscsi_param_add(struct iscsi_param **params, const char *key, + const char *val, const char *list, int type); +int +spdk_iscsi_param_set(struct iscsi_param *params, const char *key, + const char *val); +int +spdk_iscsi_param_set_int(struct iscsi_param *params, const char *key, uint32_t val); +int +spdk_iscsi_parse_params(struct iscsi_param **params, const uint8_t *data, + int len, bool cbit_enabled, char **partial_parameter); +char * +spdk_iscsi_param_get_val(struct iscsi_param *params, const char *key); +int +spdk_iscsi_param_eq_val(struct iscsi_param *params, const char *key, + const char *val); + +#endif /* SPDK_ISCSI_PARAM_H */ diff --git a/src/spdk/lib/iscsi/portal_grp.c b/src/spdk/lib/iscsi/portal_grp.c new file mode 100644 index 00000000..60a724c9 --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.c @@ -0,0 +1,707 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
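A short usage sketch of the parameter API declared above (hypothetical function; the key and value are arbitrary examples):

static void
example_param_usage(void)
{
	struct iscsi_param *params = NULL;
	char *val;

	/* Build a one-entry list, read the value back, then free the whole list. */
	if (spdk_iscsi_param_add(&params, "InitiatorName",
				 "iqn.2016-06.io.spdk:host1", NULL, ISPT_DECLARATIVE) == 0) {
		val = spdk_iscsi_param_get_val(params, "InitiatorName");
		assert(val != NULL && strcmp(val, "iqn.2016-06.io.spdk:host1") == 0);
	}

	spdk_iscsi_param_free(params);
}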
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/event.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/portal_grp.h" +#include "iscsi/acceptor.h" + +#define PORTNUMSTRLEN 32 + +static struct spdk_iscsi_portal * +spdk_iscsi_portal_find_by_addr(const char *host, const char *port) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &g_spdk_iscsi.portal_head, g_tailq) { + if (!strcmp(p->host, host) && !strcmp(p->port, port)) { + return p; + } + } + + return NULL; +} + +/* Assumes caller allocated host and port strings on the heap */ +struct spdk_iscsi_portal * +spdk_iscsi_portal_create(const char *host, const char *port, const char *cpumask) +{ + struct spdk_iscsi_portal *p = NULL, *tmp; + struct spdk_cpuset *core_mask = NULL; + int rc; + + assert(host != NULL); + assert(port != NULL); + + + p = calloc(1, sizeof(*p)); + if (!p) { + SPDK_ERRLOG("calloc() failed for portal\n"); + return NULL; + } + + /* check and overwrite abbreviation of wildcard */ + if (strcasecmp(host, "[*]") == 0) { + SPDK_WARNLOG("Please use \"[::]\" as IPv6 wildcard\n"); + SPDK_WARNLOG("Convert \"[*]\" to \"[::]\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + p->host = strdup("[::]"); + } else if (strcasecmp(host, "*") == 0) { + SPDK_WARNLOG("Please use \"0.0.0.0\" as IPv4 wildcard\n"); + SPDK_WARNLOG("Convert \"*\" to \"0.0.0.0\" automatically\n"); + SPDK_WARNLOG("(Use of \"[*]\" will be deprecated in a future release)"); + p->host = strdup("0.0.0.0"); + } else { + p->host = strdup(host); + } + if (!p->host) { + SPDK_ERRLOG("strdup() failed for host\n"); + goto error_out; + } + + p->port = strdup(port); + if (!p->port) { + SPDK_ERRLOG("strdup() failed for host\n"); + goto error_out; + } + + core_mask = spdk_cpuset_alloc(); + if (!core_mask) { + SPDK_ERRLOG("spdk_cpuset_alloc() failed for host\n"); + goto error_out; + } + + if (cpumask != NULL) { + rc = spdk_app_parse_core_mask(cpumask, core_mask); + if (rc < 0) { + SPDK_ERRLOG("cpumask (%s) is invalid\n", cpumask); + goto error_out; + } + if (spdk_cpuset_count(core_mask) == 0) { + SPDK_ERRLOG("cpumask (%s) does not contain core mask (0x%s)\n", + cpumask, spdk_cpuset_fmt(spdk_app_get_core_mask())); + goto error_out; + } + } else { + spdk_cpuset_copy(core_mask, spdk_app_get_core_mask()); + } + + p->cpumask = core_mask; + + p->sock = NULL; + p->group = NULL; /* set at a later time by caller */ + p->acceptor_poller = NULL; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + tmp = spdk_iscsi_portal_find_by_addr(host, port); + if (tmp != NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + SPDK_ERRLOG("portal (%s, %s) already exists\n", host, port); + goto error_out; + } + + TAILQ_INSERT_TAIL(&g_spdk_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return p; + +error_out: + spdk_cpuset_free(core_mask); + free(p->port); + free(p->host); + free(p); + + return NULL; +} + +void +spdk_iscsi_portal_destroy(struct spdk_iscsi_portal *p) +{ + assert(p != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_portal_destroy\n"); + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_REMOVE(&g_spdk_iscsi.portal_head, p, g_tailq); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + free(p->host); + free(p->port); + spdk_cpuset_free(p->cpumask); + free(p); + +} + +static int +spdk_iscsi_portal_open(struct spdk_iscsi_portal *p) +{ + struct spdk_sock *sock; + int port; + + if (p->sock 
!= NULL) { + SPDK_ERRLOG("portal (%s, %s) is already opened\n", + p->host, p->port); + return -1; + } + + port = (int)strtol(p->port, NULL, 0); + sock = spdk_sock_listen(p->host, port); + if (sock == NULL) { + SPDK_ERRLOG("listen error %.64s.%d\n", p->host, port); + return -1; + } + + p->sock = sock; + + /* + * When the portal is created by config file, incoming connection + * requests for the socket are pended to accept until reactors start. + * However the gap between listen() and accept() will be slight and + * the requests will be queued by the nonzero backlog of the socket + * or resend by TCP. + */ + spdk_iscsi_acceptor_start(p); + + return 0; +} + +static void +spdk_iscsi_portal_close(struct spdk_iscsi_portal *p) +{ + if (p->sock) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "close portal (%s, %s)\n", + p->host, p->port); + spdk_iscsi_acceptor_stop(p); + spdk_sock_close(&p->sock); + } +} + +static int +spdk_iscsi_parse_portal(const char *portalstring, struct spdk_iscsi_portal **ip, + int dry_run) +{ + char *host = NULL, *port = NULL, *cpumask = NULL; + int len, rc = -1; + const char *p, *q; + + if (portalstring == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + + /* IP address */ + if (portalstring[0] == '[') { + /* IPv6 */ + p = strchr(portalstring + 1, ']'); + if (p == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + p++; + } else { + /* IPv4 */ + p = strchr(portalstring, ':'); + if (p == NULL) { + p = portalstring + strlen(portalstring); + } + } + + if (!dry_run) { + len = p - portalstring; + host = malloc(len + 1); + if (host == NULL) { + SPDK_ERRLOG("malloc() failed for host\n"); + goto error_out; + } + memcpy(host, portalstring, len); + host[len] = '\0'; + } + + /* Port number (IPv4 and IPv6 are the same) */ + if (p[0] == '\0') { + if (!dry_run) { + port = malloc(PORTNUMSTRLEN); + if (!port) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + snprintf(port, PORTNUMSTRLEN, "%d", DEFAULT_PORT); + } + } else { + if (p[0] != ':') { + SPDK_ERRLOG("portal error\n"); + goto error_out; + } + q = strchr(portalstring, '@'); + if (q == NULL) { + q = portalstring + strlen(portalstring); + } + if (q == p) { + SPDK_ERRLOG("no port specified\n"); + goto error_out; + } + + if (!dry_run) { + len = q - p - 1; + port = malloc(len + 1); + if (port == NULL) { + SPDK_ERRLOG("malloc() failed for port\n"); + goto error_out; + } + memcpy(port, p + 1, len); + port[len] = '\0'; + } + } + + /* Cpumask (IPv4 and IPv6 are the same) */ + p = strchr(portalstring, '@'); + if (p != NULL) { + q = portalstring + strlen(portalstring); + if (q == p) { + SPDK_ERRLOG("no cpumask specified\n"); + goto error_out; + } + if (!dry_run) { + len = q - p - 1; + cpumask = malloc(len + 1); + if (cpumask == NULL) { + SPDK_ERRLOG("malloc() failed for cpumask\n"); + goto error_out; + } + memcpy(cpumask, p + 1, len); + cpumask[len] = '\0'; + } + } + + if (!dry_run) { + *ip = spdk_iscsi_portal_create(host, port, cpumask); + if (!*ip) { + goto error_out; + } + } + + rc = 0; +error_out: + free(host); + free(port); + free(cpumask); + + return rc; +} + +struct spdk_iscsi_portal_grp * +spdk_iscsi_portal_grp_create(int tag) +{ + struct spdk_iscsi_portal_grp *pg = malloc(sizeof(*pg)); + + if (!pg) { + SPDK_ERRLOG("malloc() failed for portal group\n"); + return NULL; + } + + pg->ref = 0; + pg->tag = tag; + + TAILQ_INIT(&pg->head); + + return pg; +} + +void +spdk_iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + assert(pg != NULL); + + 
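/*
 * Editorial note (hedged, not part of the original patch): callers of
 * spdk_iscsi_portal_grp_destroy() are expected to have already taken the
 * group off g_spdk_iscsi.pg_head (or never to have registered it, as in the
 * config-parse error path below); the routine here only drains the
 * per-group portal list, destroying each member portal, before freeing the
 * group itself.
 */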
SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_portal_grp_destroy\n"); + while (!TAILQ_EMPTY(&pg->head)) { + p = TAILQ_FIRST(&pg->head); + TAILQ_REMOVE(&pg->head, p, per_pg_tailq); + spdk_iscsi_portal_destroy(p); + } + free(pg); +} + +int +spdk_iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg) +{ + int rc = -1; + struct spdk_iscsi_portal_grp *tmp; + + assert(pg != NULL); + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + tmp = spdk_iscsi_portal_grp_find_by_tag(pg->tag); + if (tmp == NULL) { + TAILQ_INSERT_TAIL(&g_spdk_iscsi.pg_head, pg, tailq); + rc = 0; + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return rc; +} + +void +spdk_iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p) +{ + assert(pg != NULL); + assert(p != NULL); + + p->group = pg; + TAILQ_INSERT_TAIL(&pg->head, p, per_pg_tailq); +} + +static int +spdk_iscsi_parse_portal_grp(struct spdk_conf_section *sp) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_portal *p; + const char *val; + char *label, *portal; + int portals = 0, i = 0, rc = 0; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add portal group (from config file) %d\n", + spdk_conf_section_get_num(sp)); + + val = spdk_conf_section_get_val(sp, "Comment"); + if (val != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "Comment %s\n", val); + } + + /* counts number of definitions */ + for (i = 0; ; i++) { + /* + * label is no longer used, but we keep it in the config + * file definition so that we do not break existing config + * files. + */ + label = spdk_conf_section_get_nmval(sp, "Portal", i, 0); + portal = spdk_conf_section_get_nmval(sp, "Portal", i, 1); + if (label == NULL || portal == NULL) { + break; + } + rc = spdk_iscsi_parse_portal(portal, &p, 1); + if (rc < 0) { + SPDK_ERRLOG("parse portal error (%s)\n", portal); + return -1; + } + } + + portals = i; + if (portals > MAX_PORTAL) { + SPDK_ERRLOG("%d > MAX_PORTAL\n", portals); + return -1; + } + + pg = spdk_iscsi_portal_grp_create(spdk_conf_section_get_num(sp)); + if (!pg) { + SPDK_ERRLOG("portal group malloc error (%s)\n", spdk_conf_section_get_name(sp)); + return -1; + } + + for (i = 0; i < portals; i++) { + label = spdk_conf_section_get_nmval(sp, "Portal", i, 0); + portal = spdk_conf_section_get_nmval(sp, "Portal", i, 1); + if (label == NULL || portal == NULL) { + SPDK_ERRLOG("portal error\n"); + goto error; + } + + rc = spdk_iscsi_parse_portal(portal, &p, 0); + if (rc < 0) { + SPDK_ERRLOG("parse portal error (%s)\n", portal); + goto error; + } + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "RIndex=%d, Host=%s, Port=%s, Tag=%d\n", + i, p->host, p->port, spdk_conf_section_get_num(sp)); + + spdk_iscsi_portal_grp_add_portal(pg, p); + } + + rc = spdk_iscsi_portal_grp_open(pg); + if (rc != 0) { + SPDK_ERRLOG("portal_grp_open failed\n"); + goto error; + } + + /* Add portal group to the end of the pg list */ + rc = spdk_iscsi_portal_grp_register(pg); + if (rc != 0) { + SPDK_ERRLOG("register portal failed\n"); + goto error; + } + + return 0; + +error: + spdk_iscsi_portal_grp_release(pg); + return -1; +} + +struct spdk_iscsi_portal_grp * +spdk_iscsi_portal_grp_find_by_tag(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + return pg; + } + } + + return NULL; +} + +int +spdk_iscsi_parse_portal_grps(void) +{ + int rc = 0; + struct spdk_conf_section *sp; + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "PortalGroup")) { + if (spdk_conf_section_get_num(sp) == 0) { + 
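/*
 * Editorial note (hedged, not part of the original patch): portal group
 * tags come from the numeric suffix of the section name and must be >= 1,
 * so a [PortalGroup0] section is rejected here. A minimal valid section,
 * using a hypothetical address, would look like:
 *
 *   [PortalGroup1]
 *     Portal DA1 10.0.0.1:3260@0x1
 *
 * The label ("DA1") is parsed but ignored, the port defaults to 3260 when
 * omitted, and the cpumask defaults to the application core mask.
 */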
SPDK_ERRLOG("Group 0 is invalid\n"); + return -1; + } + + /* Build portal group from cfg section PortalGroup */ + rc = spdk_iscsi_parse_portal_grp(sp); + if (rc < 0) { + SPDK_ERRLOG("parse_portal_group() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +spdk_iscsi_portal_grps_destroy(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_portal_grps_destroy\n"); + pthread_mutex_lock(&g_spdk_iscsi.mutex); + while (!TAILQ_EMPTY(&g_spdk_iscsi.pg_head)) { + pg = TAILQ_FIRST(&g_spdk_iscsi.pg_head); + TAILQ_REMOVE(&g_spdk_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + spdk_iscsi_portal_grp_destroy(pg); + pthread_mutex_lock(&g_spdk_iscsi.mutex); + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); +} + +int +spdk_iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + int rc; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + rc = spdk_iscsi_portal_open(p); + if (rc < 0) { + return rc; + } + } + return 0; +} + +static void +spdk_iscsi_portal_grp_close(struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_portal *p; + + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + spdk_iscsi_portal_close(p); + } +} + +void +spdk_iscsi_portal_grp_close_all(void) +{ + struct spdk_iscsi_portal_grp *pg; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_portal_grp_close_all\n"); + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + spdk_iscsi_portal_grp_close(pg); + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); +} + +struct spdk_iscsi_portal_grp * +spdk_iscsi_portal_grp_unregister(int tag) +{ + struct spdk_iscsi_portal_grp *pg; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + if (pg->tag == tag) { + TAILQ_REMOVE(&g_spdk_iscsi.pg_head, pg, tailq); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return pg; + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return NULL; +} + +void +spdk_iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg) +{ + spdk_iscsi_portal_grp_close(pg); + spdk_iscsi_portal_grp_destroy(pg); +} + +static const char *portal_group_section = \ + "\n" + "# Users must change the PortalGroup section(s) to match the IP addresses\n" + "# for their environment.\n" + "# PortalGroup sections define which network portals the iSCSI target\n" + "# will use to listen for incoming connections. These are also used to\n" + "# determine which targets are accessible over each portal group.\n" + "# Up to 1024 Portal directives are allowed. These define the network\n" + "# portals of the portal group. The user must specify a IP address\n" + "# for each network portal, and may optionally specify a port and\n" + "# a cpumask. If the port is omitted, 3260 will be used. Cpumask will\n" + "# be used to set the processor affinity of the iSCSI connection\n" + "# through the portal. 
If the cpumask is omitted, cpumask will be\n" + "# set to all available processors.\n" + "# Syntax:\n" + "# Portal [:[@]]\n"; + +#define PORTAL_GROUP_TMPL \ +"[PortalGroup%d]\n" \ +" Comment \"Portal%d\"\n" + +#define PORTAL_TMPL \ +" Portal DA1 %s:%s@0x%s\n" + +void +spdk_iscsi_portal_grps_config_text(FILE *fp) +{ + struct spdk_iscsi_portal *p = NULL; + struct spdk_iscsi_portal_grp *pg = NULL; + + /* Create portal group section */ + fprintf(fp, "%s", portal_group_section); + + /* Dump portal groups */ + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + if (NULL == pg) { continue; } + fprintf(fp, PORTAL_GROUP_TMPL, pg->tag, pg->tag); + /* Dump portals */ + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (NULL == p) { continue; } + fprintf(fp, PORTAL_TMPL, p->host, p->port, + spdk_cpuset_fmt(p->cpumask)); + } + } +} + +static void +spdk_iscsi_portal_grp_info_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal *portal; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_int32(w, "tag", pg->tag); + + spdk_json_write_named_array_begin(w, "portals"); + TAILQ_FOREACH(portal, &pg->head, per_pg_tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "host", portal->host); + spdk_json_write_named_string(w, "port", portal->port); + spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", + spdk_cpuset_fmt(portal->cpumask)); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); +} + +static void +spdk_iscsi_portal_grp_config_json(struct spdk_iscsi_portal_grp *pg, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "add_portal_group"); + + spdk_json_write_name(w, "params"); + spdk_iscsi_portal_grp_info_json(pg, w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + spdk_iscsi_portal_grp_info_json(pg, w); + } +} + +void +spdk_iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_portal_grp *pg; + + TAILQ_FOREACH(pg, &g_spdk_iscsi.pg_head, tailq) { + spdk_iscsi_portal_grp_config_json(pg, w); + } +} diff --git a/src/spdk/lib/iscsi/portal_grp.h b/src/spdk/lib/iscsi/portal_grp.h new file mode 100644 index 00000000..08cb3992 --- /dev/null +++ b/src/spdk/lib/iscsi/portal_grp.h @@ -0,0 +1,83 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_PORTAL_GRP_H +#define SPDK_PORTAL_GRP_H + +#include "spdk/conf.h" +#include "spdk/cpuset.h" + +struct spdk_json_write_ctx; + +struct spdk_iscsi_portal { + struct spdk_iscsi_portal_grp *group; + char *host; + char *port; + struct spdk_sock *sock; + struct spdk_cpuset *cpumask; + struct spdk_poller *acceptor_poller; + TAILQ_ENTRY(spdk_iscsi_portal) per_pg_tailq; + TAILQ_ENTRY(spdk_iscsi_portal) g_tailq; +}; + +struct spdk_iscsi_portal_grp { + int ref; + int tag; + TAILQ_ENTRY(spdk_iscsi_portal_grp) tailq; + TAILQ_HEAD(, spdk_iscsi_portal) head; +}; + +/* SPDK iSCSI Portal Group management API */ + +struct spdk_iscsi_portal *spdk_iscsi_portal_create(const char *host, const char *port, + const char *cpumask); +void spdk_iscsi_portal_destroy(struct spdk_iscsi_portal *p); + +struct spdk_iscsi_portal_grp *spdk_iscsi_portal_grp_create(int tag); +void spdk_iscsi_portal_grp_add_portal(struct spdk_iscsi_portal_grp *pg, + struct spdk_iscsi_portal *p); +void spdk_iscsi_portal_grp_destroy(struct spdk_iscsi_portal_grp *pg); +void spdk_iscsi_portal_grp_release(struct spdk_iscsi_portal_grp *pg); +int spdk_iscsi_parse_portal_grps(void); +void spdk_iscsi_portal_grps_destroy(void); +int spdk_iscsi_portal_grp_register(struct spdk_iscsi_portal_grp *pg); +struct spdk_iscsi_portal_grp *spdk_iscsi_portal_grp_unregister(int tag); +struct spdk_iscsi_portal_grp *spdk_iscsi_portal_grp_find_by_tag(int tag); +int spdk_iscsi_portal_grp_open(struct spdk_iscsi_portal_grp *pg); + +void spdk_iscsi_portal_grp_close_all(void); +void spdk_iscsi_portal_grps_config_text(FILE *fp); +void spdk_iscsi_portal_grps_info_json(struct spdk_json_write_ctx *w); +void spdk_iscsi_portal_grps_config_json(struct spdk_json_write_ctx *w); +#endif // SPDK_PORTAL_GRP_H diff --git a/src/spdk/lib/iscsi/task.c b/src/spdk/lib/iscsi/task.c new file mode 100644 index 00000000..6b56cd97 --- /dev/null +++ b/src/spdk/lib/iscsi/task.c @@ -0,0 +1,88 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/env.h" +#include "spdk/log.h" +#include "iscsi/conn.h" +#include "iscsi/task.h" + +static void +spdk_iscsi_task_free(struct spdk_scsi_task *scsi_task) +{ + struct spdk_iscsi_task *task = spdk_iscsi_task_from_scsi_task(scsi_task); + + if (task->parent) { + spdk_scsi_task_put(&task->parent->scsi); + task->parent = NULL; + } + + spdk_iscsi_task_disassociate_pdu(task); + assert(task->conn->pending_task_cnt > 0); + task->conn->pending_task_cnt--; + spdk_mempool_put(g_spdk_iscsi.task_pool, (void *)task); +} + +struct spdk_iscsi_task * +spdk_iscsi_task_get(struct spdk_iscsi_conn *conn, struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn) +{ + struct spdk_iscsi_task *task; + + task = spdk_mempool_get(g_spdk_iscsi.task_pool); + if (!task) { + SPDK_ERRLOG("Unable to get task\n"); + abort(); + } + + memset(task, 0, sizeof(*task)); + task->conn = conn; + assert(conn->pending_task_cnt < UINT32_MAX); + conn->pending_task_cnt++; + spdk_scsi_task_construct(&task->scsi, + cpl_fn, + spdk_iscsi_task_free); + if (parent) { + parent->scsi.ref++; + task->parent = parent; + task->tag = parent->tag; + task->scsi.dxfer_dir = parent->scsi.dxfer_dir; + task->scsi.transfer_len = parent->scsi.transfer_len; + task->scsi.lun = parent->scsi.lun; + task->scsi.cdb = parent->scsi.cdb; + task->scsi.target_port = parent->scsi.target_port; + task->scsi.initiator_port = parent->scsi.initiator_port; + } + + return task; +} diff --git a/src/spdk/lib/iscsi/task.h b/src/spdk/lib/iscsi/task.h new file mode 100644 index 00000000..fea928ac --- /dev/null +++ b/src/spdk/lib/iscsi/task.h @@ -0,0 +1,187 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_TASK_H +#define SPDK_ISCSI_TASK_H + +#include "iscsi/iscsi.h" +#include "spdk/scsi.h" +#include "spdk/util.h" + +struct spdk_iscsi_task { + struct spdk_scsi_task scsi; + + struct spdk_iscsi_task *parent; + + struct spdk_iscsi_conn *conn; + struct spdk_iscsi_pdu *pdu; + uint32_t outstanding_r2t; + + uint32_t desired_data_transfer_length; + + /* Only valid for Read/Write */ + uint32_t bytes_completed; + + uint32_t data_out_cnt; + + /* + * Tracks the current offset of large read io. + */ + uint32_t current_datain_offset; + + /* + * next_expected_r2t_offset is used when we receive + * the DataOUT PDU. + */ + uint32_t next_expected_r2t_offset; + + /* + * Tracks the length of the R2T that is in progress. + * Used to check that an R2T burst does not exceed + * MaxBurstLength. + */ + uint32_t current_r2t_length; + + /* + * next_r2t_offset is used when we are sending the + * R2T packet to keep track of next offset of r2t. + */ + uint32_t next_r2t_offset; + uint32_t R2TSN; + uint32_t r2t_datasn; /* record next datasn for a r2tsn */ + uint32_t acked_r2tsn; /* next r2tsn to be acked */ + uint32_t datain_datasn; + uint32_t acked_data_sn; /* next expected datain datasn */ + uint32_t ttt; + + uint32_t tag; + + /** + * Record the lun id just in case the lun is invalid, + * which will happen when hot removing the lun. 
+ */ + int lun_id; + + TAILQ_ENTRY(spdk_iscsi_task) link; + + TAILQ_HEAD(subtask_list, spdk_iscsi_task) subtask_list; + TAILQ_ENTRY(spdk_iscsi_task) subtask_link; + bool is_queued; /* is queued in scsi layer for handling */ +}; + +static inline void +spdk_iscsi_task_put(struct spdk_iscsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static inline struct spdk_iscsi_pdu * +spdk_iscsi_task_get_pdu(struct spdk_iscsi_task *task) +{ + return task->pdu; +} + +static inline void +spdk_iscsi_task_set_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + task->pdu = pdu; +} + +static inline struct iscsi_bhs * +spdk_iscsi_task_get_bhs(struct spdk_iscsi_task *task) +{ + return &spdk_iscsi_task_get_pdu(task)->bhs; +} + +static inline void +spdk_iscsi_task_associate_pdu(struct spdk_iscsi_task *task, struct spdk_iscsi_pdu *pdu) +{ + spdk_iscsi_task_set_pdu(task, pdu); + pdu->ref++; +} + +static inline void +spdk_iscsi_task_disassociate_pdu(struct spdk_iscsi_task *task) +{ + if (spdk_iscsi_task_get_pdu(task)) { + spdk_put_pdu(spdk_iscsi_task_get_pdu(task)); + spdk_iscsi_task_set_pdu(task, NULL); + } +} + +static inline int +spdk_iscsi_task_is_immediate(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)spdk_iscsi_task_get_bhs(task); + return (scsi_req->immediate == 1); +} + +static inline int +spdk_iscsi_task_is_read(struct spdk_iscsi_task *task) +{ + struct iscsi_bhs_scsi_req *scsi_req; + + scsi_req = (struct iscsi_bhs_scsi_req *)spdk_iscsi_task_get_bhs(task); + return (scsi_req->read_bit == 1); +} + +static inline uint32_t +spdk_iscsi_task_get_cmdsn(struct spdk_iscsi_task *task) +{ + return spdk_iscsi_task_get_pdu(task)->cmd_sn; +} + +struct spdk_iscsi_task *spdk_iscsi_task_get(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_task *parent, + spdk_scsi_task_cpl cpl_fn); + +static inline struct spdk_iscsi_task * +spdk_iscsi_task_from_scsi_task(struct spdk_scsi_task *task) +{ + return SPDK_CONTAINEROF(task, struct spdk_iscsi_task, scsi); +} + +static inline struct spdk_iscsi_task * +spdk_iscsi_task_get_primary(struct spdk_iscsi_task *task) +{ + if (task->parent) { + return task->parent; + } else { + return task; + } +} + +#endif /* SPDK_ISCSI_TASK_H */ diff --git a/src/spdk/lib/iscsi/tgt_node.c b/src/spdk/lib/iscsi/tgt_node.c new file mode 100644 index 00000000..97b5bbe1 --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.c @@ -0,0 +1,1538 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/conf.h" +#include "spdk/sock.h" +#include "spdk/scsi.h" + +#include "spdk_internal/log.h" + +#include "iscsi/iscsi.h" +#include "iscsi/conn.h" +#include "iscsi/tgt_node.h" +#include "iscsi/portal_grp.h" +#include "iscsi/init_grp.h" +#include "iscsi/task.h" + +#define MAX_TMPBUF 1024 +#define MAX_MASKBUF 128 + +static bool +spdk_iscsi_ipv6_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in6_addr in6_mask; + struct in6_addr in6_addr; + char mask[MAX_MASKBUF]; + const char *p; + size_t n; + int bits, bmask; + int i; + + if (netmask[0] != '[') { + return false; + } + p = strchr(netmask, ']'); + if (p == NULL) { + return false; + } + n = p - (netmask + 1); + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask + 1, n); + mask[n] = '\0'; + p++; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 128) { + return false; + } + } else { + bits = 128; + } + +#if 0 + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "input %s\n", addr); + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "mask %s / %d\n", mask, bits); +#endif + + /* presentation to network order binary */ + if (inet_pton(AF_INET6, mask, &in6_mask) <= 0 + || inet_pton(AF_INET6, addr, &in6_addr) <= 0) { + return false; + } + + /* check 128bits */ + for (i = 0; i < (bits / 8); i++) { + if (in6_mask.s6_addr[i] != in6_addr.s6_addr[i]) { + return false; + } + } + if (bits % 8) { + bmask = (0xffU << (8 - (bits % 8))) & 0xffU; + if ((in6_mask.s6_addr[i] & bmask) != (in6_addr.s6_addr[i] & bmask)) { + return false; + } + } + + /* match */ + return true; +} + +static bool +spdk_iscsi_ipv4_netmask_allow_addr(const char *netmask, const char *addr) +{ + struct in_addr in4_mask; + struct in_addr in4_addr; + char mask[MAX_MASKBUF]; + const char *p; + uint32_t bmask; + size_t n; + int bits; + + p = strchr(netmask, '/'); + if (p == NULL) { + p = netmask + strlen(netmask); + } + n = p - netmask; + if (n + 1 > sizeof mask) { + return false; + } + + memcpy(mask, netmask, n); + mask[n] = '\0'; + + if (p[0] == '/') { + bits = (int) strtol(p + 1, NULL, 10); + if (bits <= 0 || bits > 32) { + return false; + } + } else { + bits = 32; + } + + /* presentation to network order binary */ + if (inet_pton(AF_INET, mask, &in4_mask) <= 0 + || inet_pton(AF_INET, addr, &in4_addr) <= 0) { + return false; + } + + /* check 32bits */ + bmask = (0xffffffffU << (32 - bits)) & 0xffffffffU; + if ((ntohl(in4_mask.s_addr) & bmask) != (ntohl(in4_addr.s_addr) & bmask)) { + return false; + } + + /* match */ + return true; +} + +static bool +spdk_iscsi_netmask_allow_addr(const char *netmask, const char *addr) +{ + if (netmask == NULL || addr == NULL) { + return false; + } + if (strcasecmp(netmask, "ANY") == 
0) { + return true; + } + if (netmask[0] == '[') { + /* IPv6 */ + if (spdk_iscsi_ipv6_netmask_allow_addr(netmask, addr)) { + return true; + } + } else { + /* IPv4 */ + if (spdk_iscsi_ipv4_netmask_allow_addr(netmask, addr)) { + return true; + } + } + return false; +} + +static bool +spdk_iscsi_init_grp_allow_addr(struct spdk_iscsi_init_grp *igp, + const char *addr) +{ + struct spdk_iscsi_initiator_netmask *imask; + + TAILQ_FOREACH(imask, &igp->netmask_head, tailq) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "netmask=%s, addr=%s\n", + imask->mask, addr); + if (spdk_iscsi_netmask_allow_addr(imask->mask, addr)) { + return true; + } + } + return false; +} + +static int +spdk_iscsi_init_grp_allow_iscsi_name(struct spdk_iscsi_init_grp *igp, + const char *iqn, bool *result) +{ + struct spdk_iscsi_initiator_name *iname; + + TAILQ_FOREACH(iname, &igp->initiator_head, tailq) { + /* denied if iqn is matched */ + if ((iname->name[0] == '!') + && (strcasecmp(&iname->name[1], "ANY") == 0 + || strcasecmp(&iname->name[1], iqn) == 0)) { + *result = false; + return 0; + } + /* allowed if iqn is matched */ + if (strcasecmp(iname->name, "ANY") == 0 + || strcasecmp(iname->name, iqn) == 0) { + *result = true; + return 0; + } + } + return -1; +} + +static struct spdk_iscsi_pg_map * +spdk_iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg); + +bool +spdk_iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, const char *addr) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool allowed = false; + + if (conn == NULL || target == NULL || iqn == NULL || addr == NULL) { + return false; + } + pg = conn->portal->group; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "pg=%d, iqn=%s, addr=%s\n", + pg->tag, iqn, addr); + pg_map = spdk_iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return false; + } + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = spdk_iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &allowed); + if (rc == 0) { + if (allowed == false) { + goto denied; + } else { + if (spdk_iscsi_init_grp_allow_addr(ig_map->ig, addr)) { + return true; + } + } + } else { + /* netmask is denied in this initiator group */ + } + } + +denied: + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "access denied from %s (%s) to %s (%s:%s,%d)\n", + iqn, addr, target->name, conn->portal->host, + conn->portal->port, conn->portal->group->tag); + return false; +} + +static bool +spdk_iscsi_tgt_node_allow_iscsi_name(struct spdk_iscsi_tgt_node *target, const char *iqn) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int rc; + bool result = false; + + if (target == NULL || iqn == NULL) { + return false; + } + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + rc = spdk_iscsi_init_grp_allow_iscsi_name(ig_map->ig, iqn, &result); + if (rc == 0) { + return result; + } + } + } + + return false; +} + +int +spdk_iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_portal *p; + struct spdk_iscsi_tgt_node *target; + char *host; + int total; + int len; + int rc; + + if (conn == NULL) { + return 0; + } + + total = data_len; + if (alloc_len < 1) { + return 0; + } + if (total > alloc_len) { + 
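/*
 * Editorial note (hedged, not part of the original patch): if the text
 * already placed in the buffer (data_len) exceeds the response buffer, the
 * reply is clamped to alloc_len and NUL-terminated below, so the initiator
 * still receives a well-formed, if truncated, SendTargets payload. Each
 * entry appended further down is a NUL-separated key=value pair, e.g. with
 * hypothetical names and addresses:
 *
 *   TargetName=iqn.2016-06.io.spdk:disk1\0TargetAddress=10.0.0.1:3260,1\0
 */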
total = alloc_len; + data[total - 1] = '\0'; + return total; + } + + if (alloc_len - total < 1) { + SPDK_ERRLOG("data space small %d\n", alloc_len); + return total; + } + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + if (strcasecmp(tiqn, "ALL") != 0 + && strcasecmp(tiqn, target->name) != 0) { + continue; + } + rc = spdk_iscsi_tgt_node_allow_iscsi_name(target, iiqn); + if (rc == 0) { + continue; + } + + /* DO SENDTARGETS */ + len = snprintf((char *) data + total, alloc_len - total, + "TargetName=%s", target->name); + total += len + 1; + + /* write to data */ + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + pg = pg_map->pg; + TAILQ_FOREACH(p, &pg->head, per_pg_tailq) { + if (alloc_len - total < 1) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + SPDK_ERRLOG("data space small %d\n", alloc_len); + return total; + } + host = p->host; + /* wildcard? */ + if (strcasecmp(host, "[::]") == 0 + || strcasecmp(host, "0.0.0.0") == 0) { + if (spdk_sock_is_ipv6(conn->sock)) { + snprintf(buf, sizeof buf, "[%s]", + conn->target_addr); + host = buf; + } else if (spdk_sock_is_ipv4(conn->sock)) { + snprintf(buf, sizeof buf, "%s", + conn->target_addr); + host = buf; + } else { + /* skip portal for the family */ + continue; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, + "TargetAddress=%s:%s,%d\n", + host, p->port, pg->tag); + len = snprintf((char *) data + total, + alloc_len - total, + "TargetAddress=%s:%s,%d", + host, p->port, pg->tag); + total += len + 1; + } + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return total; +} + +struct spdk_iscsi_tgt_node * +spdk_iscsi_find_tgt_node(const char *target_name) +{ + struct spdk_iscsi_tgt_node *target; + + if (target_name == NULL) { + return NULL; + } + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + if (strcasecmp(target_name, target->name) == 0) { + return target; + } + } + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "can't find target %s\n", target_name); + return NULL; +} + +static int +spdk_iscsi_tgt_node_register(struct spdk_iscsi_tgt_node *target) +{ + pthread_mutex_lock(&g_spdk_iscsi.mutex); + + if (spdk_iscsi_find_tgt_node(target->name) != NULL) { + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return -EEXIST; + } + + TAILQ_INSERT_TAIL(&g_spdk_iscsi.target_head, target, tailq); + + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return 0; +} + +static int +spdk_iscsi_tgt_node_unregister(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_tgt_node *t; + + TAILQ_FOREACH(t, &g_spdk_iscsi.target_head, tailq) { + if (t == target) { + TAILQ_REMOVE(&g_spdk_iscsi.target_head, t, tailq); + return 0; + } + } + + return -1; +} + +static struct spdk_iscsi_ig_map * +spdk_iscsi_pg_map_find_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + if (ig_map->ig == ig) { + return ig_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_ig_map * +spdk_iscsi_pg_map_add_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + if (spdk_iscsi_pg_map_find_ig_map(pg_map, ig) != NULL) { + return NULL; + } + + ig_map = malloc(sizeof(*ig_map)); + if (ig_map == NULL) { + return NULL; + } + + ig_map->ig = ig; + ig->ref++; + pg_map->num_ig_maps++; + TAILQ_INSERT_TAIL(&pg_map->ig_map_head, ig_map, tailq); + + return ig_map; +} + +static void +_spdk_iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_ig_map 
*ig_map) +{ + TAILQ_REMOVE(&pg_map->ig_map_head, ig_map, tailq); + pg_map->num_ig_maps--; + ig_map->ig->ref--; + free(ig_map); +} + +static int +spdk_iscsi_pg_map_delete_ig_map(struct spdk_iscsi_pg_map *pg_map, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_ig_map *ig_map; + + ig_map = spdk_iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + return -ENOENT; + } + + _spdk_iscsi_pg_map_delete_ig_map(pg_map, ig_map); + return 0; +} + +static void +spdk_iscsi_pg_map_delete_all_ig_maps(struct spdk_iscsi_pg_map *pg_map) +{ + struct spdk_iscsi_ig_map *ig_map, *tmp; + + TAILQ_FOREACH_SAFE(ig_map, &pg_map->ig_map_head, tailq, tmp) { + _spdk_iscsi_pg_map_delete_ig_map(pg_map, ig_map); + } +} + +static struct spdk_iscsi_pg_map * +spdk_iscsi_tgt_node_find_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + if (pg_map->pg == pg) { + return pg_map; + } + } + + return NULL; +} + +static struct spdk_iscsi_pg_map * +spdk_iscsi_tgt_node_add_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + char port_name[MAX_TMPBUF]; + int rc; + + if (spdk_iscsi_tgt_node_find_pg_map(target, pg) != NULL) { + return NULL; + } + + if (target->num_pg_maps >= SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("Number of PG maps is more than allowed (max=%d)\n", + SPDK_SCSI_DEV_MAX_PORTS); + return NULL; + } + + pg_map = malloc(sizeof(*pg_map)); + if (pg_map == NULL) { + return NULL; + } + + snprintf(port_name, sizeof(port_name), "%s,t,0x%4.4x", + spdk_scsi_dev_get_name(target->dev), pg->tag); + rc = spdk_scsi_dev_add_port(target->dev, pg->tag, port_name); + if (rc != 0) { + free(pg_map); + return NULL; + } + + TAILQ_INIT(&pg_map->ig_map_head); + pg_map->num_ig_maps = 0; + pg->ref++; + pg_map->pg = pg; + target->num_pg_maps++; + TAILQ_INSERT_TAIL(&target->pg_map_head, pg_map, tailq); + + return pg_map; +} + +static void +_spdk_iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_pg_map *pg_map) +{ + TAILQ_REMOVE(&target->pg_map_head, pg_map, tailq); + target->num_pg_maps--; + pg_map->pg->ref--; + + spdk_scsi_dev_delete_port(target->dev, pg_map->pg->tag); + + free(pg_map); +} + +static int +spdk_iscsi_tgt_node_delete_pg_map(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_portal_grp *pg) +{ + struct spdk_iscsi_pg_map *pg_map; + + pg_map = spdk_iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + return -ENOENT; + } + + if (pg_map->num_ig_maps > 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "delete %d ig_maps forcefully\n", + pg_map->num_ig_maps); + } + + spdk_iscsi_pg_map_delete_all_ig_maps(pg_map); + _spdk_iscsi_tgt_node_delete_pg_map(target, pg_map); + return 0; +} + +static void +spdk_iscsi_tgt_node_delete_ig_maps(struct spdk_iscsi_tgt_node *target, + struct spdk_iscsi_init_grp *ig) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + spdk_iscsi_pg_map_delete_ig_map(pg_map, ig); + if (pg_map->num_ig_maps == 0) { + _spdk_iscsi_tgt_node_delete_pg_map(target, pg_map); + } + } +} + +static void +spdk_iscsi_tgt_node_delete_all_pg_maps(struct spdk_iscsi_tgt_node *target) +{ + struct spdk_iscsi_pg_map *pg_map, *tmp; + + TAILQ_FOREACH_SAFE(pg_map, &target->pg_map_head, tailq, tmp) { + spdk_iscsi_pg_map_delete_all_ig_maps(pg_map); + _spdk_iscsi_tgt_node_delete_pg_map(target, pg_map); + } +} + +static void 
+spdk_iscsi_tgt_node_destruct(struct spdk_iscsi_tgt_node *target) +{ + if (target == NULL) { + return; + } + + free(target->name); + free(target->alias); + spdk_iscsi_tgt_node_delete_all_pg_maps(target); + spdk_scsi_dev_destruct(target->dev); + + pthread_mutex_destroy(&target->mutex); + free(target); +} + +static int +spdk_iscsi_tgt_node_delete_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + pg = spdk_iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = spdk_iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + pg_map = spdk_iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + ig_map = spdk_iscsi_pg_map_find_ig_map(pg_map, ig); + if (ig_map == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d is not mapped\n", target->name, pg_tag); + return -ENOENT; + } + + _spdk_iscsi_pg_map_delete_ig_map(pg_map, ig_map); + if (pg_map->num_ig_maps == 0) { + _spdk_iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return 0; +} + +static int +spdk_iscsi_tgt_node_add_pg_ig_map(struct spdk_iscsi_tgt_node *target, + int pg_tag, int ig_tag) +{ + struct spdk_iscsi_portal_grp *pg; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_init_grp *ig; + struct spdk_iscsi_ig_map *ig_map; + bool new_pg_map = false; + + pg = spdk_iscsi_portal_grp_find_by_tag(pg_tag); + if (pg == NULL) { + SPDK_ERRLOG("%s: PortalGroup%d not found\n", target->name, pg_tag); + return -ENOENT; + } + ig = spdk_iscsi_init_grp_find_by_tag(ig_tag); + if (ig == NULL) { + SPDK_ERRLOG("%s: InitiatorGroup%d not found\n", target->name, ig_tag); + return -ENOENT; + } + + /* get existing pg_map or create new pg_map and add it to target */ + pg_map = spdk_iscsi_tgt_node_find_pg_map(target, pg); + if (pg_map == NULL) { + pg_map = spdk_iscsi_tgt_node_add_pg_map(target, pg); + if (pg_map == NULL) { + goto failed; + } + new_pg_map = true; + } + + /* create new ig_map and add it to pg_map */ + ig_map = spdk_iscsi_pg_map_add_ig_map(pg_map, ig); + if (ig_map == NULL) { + goto failed; + } + + return 0; + +failed: + if (new_pg_map) { + _spdk_iscsi_tgt_node_delete_pg_map(target, pg_map); + } + + return -1; +} + +int +spdk_iscsi_tgt_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + for (i = 0; i < num_maps; i++) { + rc = spdk_iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i], + ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + spdk_iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return -1; +} + +int +spdk_iscsi_tgt_node_delete_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps) +{ + uint16_t i; + int rc; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + for (i = 0; i < num_maps; i++) { + rc = spdk_iscsi_tgt_node_delete_pg_ig_map(target, pg_tag_list[i], + 
ig_tag_list[i]); + if (rc != 0) { + SPDK_ERRLOG("could not delete map from target\n"); + goto invalid; + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return 0; + +invalid: + for (; i > 0; --i) { + rc = spdk_iscsi_tgt_node_add_pg_ig_map(target, pg_tag_list[i - 1], + ig_tag_list[i - 1]); + if (rc != 0) { + spdk_iscsi_tgt_node_delete_all_pg_maps(target); + break; + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + return -1; +} + +static int +spdk_check_iscsi_name(const char *name) +{ + const unsigned char *up = (const unsigned char *) name; + size_t n; + + /* valid iSCSI name? */ + for (n = 0; up[n] != 0; n++) { + if (up[n] > 0x00U && up[n] <= 0x2cU) { + return -1; + } + if (up[n] == 0x2fU) { + return -1; + } + if (up[n] >= 0x3bU && up[n] <= 0x40U) { + return -1; + } + if (up[n] >= 0x5bU && up[n] <= 0x60U) { + return -1; + } + if (up[n] >= 0x7bU && up[n] <= 0x7fU) { + return -1; + } + if (isspace(up[n])) { + return -1; + } + } + /* valid format? */ + if (strncasecmp(name, "iqn.", 4) == 0) { + /* iqn.YYYY-MM.reversed.domain.name */ + if (!isdigit(up[4]) || !isdigit(up[5]) || !isdigit(up[6]) + || !isdigit(up[7]) || up[8] != '-' || !isdigit(up[9]) + || !isdigit(up[10]) || up[11] != '.') { + SPDK_ERRLOG("invalid iqn format. " + "expect \"iqn.YYYY-MM.reversed.domain.name\"\n"); + return -1; + } + } else if (strncasecmp(name, "eui.", 4) == 0) { + /* EUI-64 -> 16bytes */ + /* XXX */ + } else if (strncasecmp(name, "naa.", 4) == 0) { + /* 64bit -> 16bytes, 128bit -> 32bytes */ + /* XXX */ + } + /* OK */ + return 0; +} + +bool +spdk_iscsi_check_chap_params(bool disable, bool require, bool mutual, int group) +{ + if (group < 0) { + SPDK_ERRLOG("Invalid auth group ID (%d)\n", group); + return false; + } + if ((!disable && !require && !mutual) || /* Auto */ + (disable && !require && !mutual) || /* None */ + (!disable && require && !mutual) || /* CHAP */ + (!disable && require && mutual)) { /* CHAP Mutual */ + return true; + } + SPDK_ERRLOG("Invalid combination of CHAP params (d=%d,r=%d,m=%d)\n", + disable, require, mutual); + return false; +} + +_spdk_iscsi_tgt_node * +spdk_iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest) +{ + char fullname[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int rc; + + if (!spdk_iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return NULL; + } + + if (num_maps == 0) { + SPDK_ERRLOG("num_maps = 0\n"); + return NULL; + } + + if (name == NULL) { + SPDK_ERRLOG("TargetName not found\n"); + return NULL; + } + + if (strncasecmp(name, "iqn.", 4) != 0 + && strncasecmp(name, "eui.", 4) != 0 + && strncasecmp(name, "naa.", 4) != 0) { + snprintf(fullname, sizeof(fullname), "%s:%s", g_spdk_iscsi.nodebase, name); + } else { + snprintf(fullname, sizeof(fullname), "%s", name); + } + + if (spdk_check_iscsi_name(fullname) != 0) { + SPDK_ERRLOG("TargetName %s contains an invalid character or format.\n", + name); + return NULL; + } + + target = malloc(sizeof(*target)); + if (!target) { + SPDK_ERRLOG("could not allocate target\n"); + return NULL; + } + + memset(target, 0, sizeof(*target)); + + rc = pthread_mutex_init(&target->mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("tgt_node%d: mutex_init() failed\n", target->num); + spdk_iscsi_tgt_node_destruct(target); + 
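/*
 * Editorial note (hedged, not part of the original patch): every failure
 * path in this constructor, starting with the mutex-init error above,
 * releases the partially built target through spdk_iscsi_tgt_node_destruct()
 * rather than freeing individual members; the earlier memset() ensures that
 * fields which have not been set up yet are still zeroed at that point.
 */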
return NULL; + } + + target->num = target_index; + + target->name = strdup(fullname); + if (!target->name) { + SPDK_ERRLOG("Could not allocate TargetName\n"); + spdk_iscsi_tgt_node_destruct(target); + return NULL; + } + + if (alias == NULL) { + target->alias = NULL; + } else { + target->alias = strdup(alias); + if (!target->alias) { + SPDK_ERRLOG("Could not allocate TargetAlias\n"); + spdk_iscsi_tgt_node_destruct(target); + return NULL; + } + } + + target->dev = spdk_scsi_dev_construct(fullname, bdev_name_list, lun_id_list, num_luns, + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI, NULL, NULL); + if (!target->dev) { + SPDK_ERRLOG("Could not construct SCSI device\n"); + spdk_iscsi_tgt_node_destruct(target); + return NULL; + } + + TAILQ_INIT(&target->pg_map_head); + rc = spdk_iscsi_tgt_node_add_pg_ig_maps(target, pg_tag_list, ig_tag_list, num_maps); + if (rc != 0) { + SPDK_ERRLOG("could not add map to target\n"); + spdk_iscsi_tgt_node_destruct(target); + return NULL; + } + + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + target->header_digest = header_digest; + target->data_digest = data_digest; + + if (queue_depth > 0 && ((uint32_t)queue_depth <= g_spdk_iscsi.MaxQueueDepth)) { + target->queue_depth = queue_depth; + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "QueueDepth %d is invalid and %d is used instead.\n", + queue_depth, g_spdk_iscsi.MaxQueueDepth); + target->queue_depth = g_spdk_iscsi.MaxQueueDepth; + } + + rc = spdk_iscsi_tgt_node_register(target); + if (rc != 0) { + SPDK_ERRLOG("register target is failed\n"); + spdk_iscsi_tgt_node_destruct(target); + return NULL; + } + + return target; +} + +static int +spdk_iscsi_parse_tgt_node(struct spdk_conf_section *sp) +{ + char buf[MAX_TMPBUF]; + struct spdk_iscsi_tgt_node *target; + int pg_tag_list[MAX_TARGET_MAP], ig_tag_list[MAX_TARGET_MAP]; + int num_target_maps; + const char *alias, *pg_tag, *ig_tag; + const char *ag_tag; + const char *val, *name; + int target_num, chap_group, pg_tag_i, ig_tag_i; + bool header_digest, data_digest; + bool disable_chap, require_chap, mutual_chap; + int i; + int lun_id_list[SPDK_SCSI_DEV_MAX_LUN]; + const char *bdev_name_list[SPDK_SCSI_DEV_MAX_LUN]; + int num_luns, queue_depth; + + target_num = spdk_conf_section_get_num(sp); + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "add unit %d\n", target_num); + + data_digest = false; + header_digest = false; + + name = spdk_conf_section_get_val(sp, "TargetName"); + + if (name == NULL) { + SPDK_ERRLOG("tgt_node%d: TargetName not found\n", target_num); + return -1; + } + + alias = spdk_conf_section_get_val(sp, "TargetAlias"); + + /* Setup initiator and portal group mapping */ + val = spdk_conf_section_get_val(sp, "Mapping"); + if (val == NULL) { + /* no map */ + SPDK_ERRLOG("tgt_node%d: no Mapping\n", target_num); + return -1; + } + + for (i = 0; i < MAX_TARGET_MAP; i++) { + val = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + if (val == NULL) { + break; + } + pg_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 0); + ig_tag = spdk_conf_section_get_nmval(sp, "Mapping", i, 1); + if (pg_tag == NULL || ig_tag == NULL) { + SPDK_ERRLOG("tgt_node%d: mapping error\n", target_num); + return -1; + } + if (strncasecmp(pg_tag, "PortalGroup", + strlen("PortalGroup")) != 0 + || sscanf(pg_tag, "%*[^0-9]%d", &pg_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping portal error\n", target_num); + return -1; + } + if (strncasecmp(ig_tag, "InitiatorGroup", + strlen("InitiatorGroup")) != 0 + || 
sscanf(ig_tag, "%*[^0-9]%d", &ig_tag_i) != 1) { + SPDK_ERRLOG("tgt_node%d: mapping initiator error\n", target_num); + return -1; + } + if (pg_tag_i < 1 || ig_tag_i < 1) { + SPDK_ERRLOG("tgt_node%d: invalid group tag\n", target_num); + return -1; + } + pg_tag_list[i] = pg_tag_i; + ig_tag_list[i] = ig_tag_i; + } + + num_target_maps = i; + + /* Setup AuthMethod */ + val = spdk_conf_section_get_val(sp, "AuthMethod"); + disable_chap = false; + require_chap = false; + mutual_chap = false; + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "AuthMethod", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "CHAP") == 0) { + require_chap = true; + } else if (strcasecmp(val, "Mutual") == 0) { + mutual_chap = true; + } else if (strcasecmp(val, "Auto") == 0) { + disable_chap = false; + require_chap = false; + mutual_chap = false; + } else if (strcasecmp(val, "None") == 0) { + disable_chap = true; + require_chap = false; + mutual_chap = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown auth\n", target_num); + return -1; + } + } + if (mutual_chap && !require_chap) { + SPDK_ERRLOG("tgt_node%d: Mutual but not CHAP\n", target_num); + return -1; + } + } + if (disable_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod None\n"); + } else if (!require_chap) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthMethod CHAP %s\n", + mutual_chap ? "Mutual" : ""); + } + + val = spdk_conf_section_get_val(sp, "AuthGroup"); + if (val == NULL) { + chap_group = 0; + } else { + ag_tag = val; + if (strcasecmp(ag_tag, "None") == 0) { + chap_group = 0; + } else { + if (strncasecmp(ag_tag, "AuthGroup", + strlen("AuthGroup")) != 0 + || sscanf(ag_tag, "%*[^0-9]%d", &chap_group) != 1) { + SPDK_ERRLOG("tgt_node%d: auth group error\n", target_num); + return -1; + } + if (chap_group == 0) { + SPDK_ERRLOG("tgt_node%d: invalid auth group 0\n", target_num); + return -1; + } + } + } + if (chap_group == 0) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup None\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "AuthGroup AuthGroup%d\n", chap_group); + } + + val = spdk_conf_section_get_val(sp, "UseDigest"); + if (val != NULL) { + for (i = 0; ; i++) { + val = spdk_conf_section_get_nmval(sp, "UseDigest", 0, i); + if (val == NULL) { + break; + } + if (strcasecmp(val, "Header") == 0) { + header_digest = true; + } else if (strcasecmp(val, "Data") == 0) { + data_digest = true; + } else if (strcasecmp(val, "Auto") == 0) { + header_digest = false; + data_digest = false; + } else { + SPDK_ERRLOG("tgt_node%d: unknown digest\n", target_num); + return -1; + } + } + } + if (!header_digest && !data_digest) { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest Auto\n"); + } else { + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "UseDigest %s %s\n", + header_digest ? "Header" : "", + data_digest ? 
"Data" : ""); + } + + val = spdk_conf_section_get_val(sp, "QueueDepth"); + if (val == NULL) { + queue_depth = g_spdk_iscsi.MaxQueueDepth; + } else { + queue_depth = (int) strtol(val, NULL, 10); + } + + num_luns = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + snprintf(buf, sizeof(buf), "LUN%d", i); + val = spdk_conf_section_get_val(sp, buf); + if (val == NULL) { + continue; + } + + bdev_name_list[num_luns] = val; + lun_id_list[num_luns] = i; + num_luns++; + } + + if (num_luns == 0) { + SPDK_ERRLOG("tgt_node%d: No LUN specified for target %s.\n", target_num, name); + return -1; + } + + target = spdk_iscsi_tgt_node_construct(target_num, name, alias, + pg_tag_list, ig_tag_list, num_target_maps, + bdev_name_list, lun_id_list, num_luns, queue_depth, + disable_chap, require_chap, mutual_chap, chap_group, + header_digest, data_digest); + + if (target == NULL) { + SPDK_ERRLOG("tgt_node%d: add_iscsi_target_node error\n", target_num); + return -1; + } + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + SPDK_INFOLOG(SPDK_LOG_ISCSI, "device %d: LUN%d %s\n", + spdk_scsi_dev_get_id(target->dev), + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + } + + return 0; +} + +int spdk_iscsi_parse_tgt_nodes(void) +{ + struct spdk_conf_section *sp; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_ISCSI, "spdk_iscsi_parse_tgt_nodes\n"); + + sp = spdk_conf_first_section(NULL); + while (sp != NULL) { + if (spdk_conf_section_match_prefix(sp, "TargetNode")) { + int tag = spdk_conf_section_get_num(sp); + + if (tag > SPDK_TN_TAG_MAX) { + SPDK_ERRLOG("tag %d is invalid\n", tag); + return -1; + } + rc = spdk_iscsi_parse_tgt_node(sp); + if (rc < 0) { + SPDK_ERRLOG("spdk_iscsi_parse_tgt_node() failed\n"); + return -1; + } + } + sp = spdk_conf_next_section(sp); + } + return 0; +} + +void +spdk_iscsi_shutdown_tgt_nodes(void) +{ + struct spdk_iscsi_tgt_node *target, *tmp; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH_SAFE(target, &g_spdk_iscsi.target_head, tailq, tmp) { + TAILQ_REMOVE(&g_spdk_iscsi.target_head, target, tailq); + spdk_iscsi_tgt_node_destruct(target); + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); +} + +int +spdk_iscsi_shutdown_tgt_node_by_name(const char *target_name) +{ + struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + target = spdk_iscsi_find_tgt_node(target_name); + if (target != NULL) { + spdk_iscsi_tgt_node_unregister(target); + spdk_iscsi_tgt_node_destruct(target); + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return 0; + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); + + return -ENOENT; +} + +int +spdk_iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target) +{ + int i; + struct spdk_iscsi_task *task; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (!lun) { + continue; + } + + /* we create a fake management task per LUN to cleanup */ + task = spdk_iscsi_task_get(conn, NULL, spdk_iscsi_task_mgmt_cpl); + if (!task) { + SPDK_ERRLOG("Unable to acquire task\n"); + return -1; + } + + task->scsi.target_port = conn->target_port; + task->scsi.initiator_port = conn->initiator_port; + task->scsi.lun = lun; + + spdk_scsi_dev_queue_mgmt_task(target->dev, &task->scsi, SPDK_SCSI_TASK_FUNC_LUN_RESET); + } + + return 0; +} + +void spdk_iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group) +{ 
+ struct spdk_iscsi_tgt_node *target; + + pthread_mutex_lock(&g_spdk_iscsi.mutex); + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + if (portal_group) { + spdk_iscsi_tgt_node_delete_pg_map(target, portal_group); + } + if (initiator_group) { + spdk_iscsi_tgt_node_delete_ig_maps(target, initiator_group); + } + } + pthread_mutex_unlock(&g_spdk_iscsi.mutex); +} + +int +spdk_iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id) +{ + struct spdk_scsi_dev *dev; + int rc; + + if (target->num_active_conns > 0) { + SPDK_ERRLOG("Target has active connections (count=%d)\n", + target->num_active_conns); + return -1; + } + + if (lun_id < -1 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + SPDK_ERRLOG("Specified LUN ID (%d) is invalid\n", lun_id); + return -1; + } + + dev = target->dev; + if (dev == NULL) { + SPDK_ERRLOG("SCSI device is not found\n"); + return -1; + } + + rc = spdk_scsi_dev_add_lun(dev, bdev_name, lun_id, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("spdk_scsi_dev_add_lun failed\n"); + return -1; + } + + return 0; +} + +int +spdk_iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group) +{ + if (!spdk_iscsi_check_chap_params(disable_chap, require_chap, + mutual_chap, chap_group)) { + return -EINVAL; + } + + pthread_mutex_lock(&target->mutex); + target->disable_chap = disable_chap; + target->require_chap = require_chap; + target->mutual_chap = mutual_chap; + target->chap_group = chap_group; + pthread_mutex_unlock(&target->mutex); + + return 0; +} + +static const char *target_nodes_section = \ + "\n" + "# Users should change the TargetNode section(s) below to match the\n" + "# desired iSCSI target node configuration.\n" + "# TargetName, Mapping, LUN0 are minimum required\n"; + +#define TARGET_NODE_TMPL \ +"[TargetNode%d]\n" \ +" Comment \"Target%d\"\n" \ +" TargetName %s\n" \ +" TargetAlias \"%s\"\n" + +#define TARGET_NODE_PGIG_MAPPING_TMPL \ +" Mapping PortalGroup%d InitiatorGroup%d\n" + +#define TARGET_NODE_AUTH_TMPL \ +" AuthMethod %s\n" \ +" AuthGroup %s\n" \ +" UseDigest %s\n" + +#define TARGET_NODE_QD_TMPL \ +" QueueDepth %d\n\n" + +#define TARGET_NODE_LUN_TMPL \ +" LUN%d %s\n" + +void +spdk_iscsi_tgt_nodes_config_text(FILE *fp) +{ + int l = 0; + struct spdk_scsi_dev *dev = NULL; + struct spdk_iscsi_tgt_node *target = NULL; + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + + /* Create target nodes section */ + fprintf(fp, "%s", target_nodes_section); + + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + int idx; + const char *authmethod = "None"; + char authgroup[32] = "None"; + const char *usedigest = "Auto"; + + dev = target->dev; + if (NULL == dev) { continue; } + + idx = target->num; + fprintf(fp, TARGET_NODE_TMPL, idx, idx, target->name, spdk_scsi_dev_get_name(dev)); + + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + fprintf(fp, TARGET_NODE_PGIG_MAPPING_TMPL, + pg_map->pg->tag, + ig_map->ig->tag); + } + } + + if (target->disable_chap) { + authmethod = "None"; + } else if (!target->require_chap) { + authmethod = "Auto"; + } else if (target->mutual_chap) { + authmethod = "CHAP Mutual"; + } else { + authmethod = "CHAP"; + } + + if (target->chap_group > 0) { + snprintf(authgroup, sizeof(authgroup), "AuthGroup%d", target->chap_group); + } + + if (target->header_digest) { + usedigest = "Header"; + } else if (target->data_digest) { + usedigest = "Data"; + } + 
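+		/*
+		 * Together with the fprintf calls below, the templates above emit one
+		 * stanza per target node.  With placeholder names and tags it looks
+		 * roughly like:
+		 *
+		 *   [TargetNode1]
+		 *     Comment "Target1"
+		 *     TargetName iqn.2016-06.io.spdk:Target1
+		 *     TargetAlias "Data Disk1"
+		 *     Mapping PortalGroup1 InitiatorGroup1
+		 *     AuthMethod Auto
+		 *     AuthGroup None
+		 *     UseDigest Auto
+		 *     LUN0 Malloc0
+		 *     QueueDepth 64
+		 */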
+ fprintf(fp, TARGET_NODE_AUTH_TMPL, + authmethod, authgroup, usedigest); + + for (l = 0; l < SPDK_SCSI_DEV_MAX_LUN; l++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(dev, l); + + if (!lun) { + continue; + } + + fprintf(fp, TARGET_NODE_LUN_TMPL, + spdk_scsi_lun_get_id(lun), + spdk_scsi_lun_get_bdev_name(lun)); + } + + fprintf(fp, TARGET_NODE_QD_TMPL, + target->queue_depth); + } +} + +static void +spdk_iscsi_tgt_node_info_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_pg_map *pg_map; + struct spdk_iscsi_ig_map *ig_map; + int i; + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "name", target->name); + + if (target->alias) { + spdk_json_write_named_string(w, "alias_name", target->alias); + } + + spdk_json_write_named_array_begin(w, "pg_ig_maps"); + TAILQ_FOREACH(pg_map, &target->pg_map_head, tailq) { + TAILQ_FOREACH(ig_map, &pg_map->ig_map_head, tailq) { + spdk_json_write_object_begin(w); + spdk_json_write_named_int32(w, "pg_tag", pg_map->pg->tag); + spdk_json_write_named_int32(w, "ig_tag", ig_map->ig->tag); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_array_begin(w, "luns"); + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + struct spdk_scsi_lun *lun = spdk_scsi_dev_get_lun(target->dev, i); + + if (lun) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_named_int32(w, "lun_id", spdk_scsi_lun_get_id(lun)); + spdk_json_write_object_end(w); + } + } + spdk_json_write_array_end(w); + + spdk_json_write_named_int32(w, "queue_depth", target->queue_depth); + + spdk_json_write_named_bool(w, "disable_chap", target->disable_chap); + spdk_json_write_named_bool(w, "require_chap", target->require_chap); + spdk_json_write_named_bool(w, "mutual_chap", target->mutual_chap); + spdk_json_write_named_int32(w, "chap_group", target->chap_group); + + spdk_json_write_named_bool(w, "header_digest", target->header_digest); + spdk_json_write_named_bool(w, "data_digest", target->data_digest); + + spdk_json_write_object_end(w); +} + +static void +spdk_iscsi_tgt_node_config_json(struct spdk_iscsi_tgt_node *target, + struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "construct_target_node"); + + spdk_json_write_name(w, "params"); + spdk_iscsi_tgt_node_info_json(target, w); + + spdk_json_write_object_end(w); +} + +void +spdk_iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + spdk_iscsi_tgt_node_info_json(target, w); + } +} + +void +spdk_iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_iscsi_tgt_node *target; + + TAILQ_FOREACH(target, &g_spdk_iscsi.target_head, tailq) { + spdk_iscsi_tgt_node_config_json(target, w); + } +} diff --git a/src/spdk/lib/iscsi/tgt_node.h b/src/spdk/lib/iscsi/tgt_node.h new file mode 100644 index 00000000..1d54922a --- /dev/null +++ b/src/spdk/lib/iscsi/tgt_node.h @@ -0,0 +1,146 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_ISCSI_TGT_NODE_H_ +#define SPDK_ISCSI_TGT_NODE_H_ + +#include "spdk/stdinc.h" + +#include "spdk/scsi.h" + +struct spdk_iscsi_conn; +struct spdk_iscsi_init_grp; +struct spdk_iscsi_portal_grp; +struct spdk_iscsi_portal; +struct spdk_json_write_ctx; + +#define MAX_TARGET_MAP 256 +#define SPDK_TN_TAG_MAX 0x0000ffff + +struct spdk_iscsi_ig_map { + struct spdk_iscsi_init_grp *ig; + TAILQ_ENTRY(spdk_iscsi_ig_map) tailq; +}; + +struct spdk_iscsi_pg_map { + struct spdk_iscsi_portal_grp *pg; + int num_ig_maps; + TAILQ_HEAD(, spdk_iscsi_ig_map) ig_map_head; + TAILQ_ENTRY(spdk_iscsi_pg_map) tailq ; +}; + +struct spdk_iscsi_tgt_node { + int num; + char *name; + char *alias; + + pthread_mutex_t mutex; + + bool disable_chap; + bool require_chap; + bool mutual_chap; + int chap_group; + bool header_digest; + bool data_digest; + int queue_depth; + + struct spdk_scsi_dev *dev; + /** + * Counts number of active iSCSI connections associated with this + * target node. + */ + uint32_t num_active_conns; + int lcore; + + int num_pg_maps; + TAILQ_HEAD(, spdk_iscsi_pg_map) pg_map_head; + TAILQ_ENTRY(spdk_iscsi_tgt_node) tailq; +}; + +int spdk_iscsi_parse_tgt_nodes(void); + +void spdk_iscsi_shutdown_tgt_nodes(void); +int spdk_iscsi_shutdown_tgt_node_by_name(const char *target_name); +int spdk_iscsi_send_tgts(struct spdk_iscsi_conn *conn, const char *iiqn, + const char *iaddr, const char *tiqn, uint8_t *data, int alloc_len, + int data_len); + +/* This typedef exists to work around an astyle 2.05 bug. + * Remove it when astyle is fixed. + */ +typedef struct spdk_iscsi_tgt_node _spdk_iscsi_tgt_node; + +/* + * bdev_name_list and lun_id_list are equal sized arrays of size num_luns. + * bdev_name_list refers to the names of the bdevs that will be used for the LUNs on the + * new target node. + * lun_id_list refers to the LUN IDs that will be used for the LUNs on the target node. 
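+ * For example, bdev_name_list = {"Malloc0", "Nvme0n1"} combined with
+ * lun_id_list = {0, 1} would create LUN0 backed by bdev Malloc0 and LUN1
+ * backed by bdev Nvme0n1 (the bdev names here are only placeholders).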
+ */ +_spdk_iscsi_tgt_node * +spdk_iscsi_tgt_node_construct(int target_index, + const char *name, const char *alias, + int *pg_tag_list, int *ig_tag_list, uint16_t num_maps, + const char *bdev_name_list[], int *lun_id_list, int num_luns, + int queue_depth, + bool disable_chap, bool require_chap, bool mutual_chap, int chap_group, + bool header_digest, bool data_digest); + +bool spdk_iscsi_check_chap_params(bool disable, bool require, bool mutual, int group); + +int spdk_iscsi_tgt_node_add_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); +int spdk_iscsi_tgt_node_delete_pg_ig_maps(struct spdk_iscsi_tgt_node *target, + int *pg_tag_list, int *ig_tag_list, + uint16_t num_maps); + +bool spdk_iscsi_tgt_node_access(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target, const char *iqn, + const char *addr); +struct spdk_iscsi_tgt_node *spdk_iscsi_find_tgt_node(const char *target_name); +int spdk_iscsi_tgt_node_reset(struct spdk_iscsi_tgt_node *target, + uint64_t lun); +int spdk_iscsi_tgt_node_cleanup_luns(struct spdk_iscsi_conn *conn, + struct spdk_iscsi_tgt_node *target); +void spdk_iscsi_tgt_node_delete_map(struct spdk_iscsi_portal_grp *portal_group, + struct spdk_iscsi_init_grp *initiator_group); +int spdk_iscsi_tgt_node_add_lun(struct spdk_iscsi_tgt_node *target, + const char *bdev_name, int lun_id); +int spdk_iscsi_tgt_node_set_chap_params(struct spdk_iscsi_tgt_node *target, + bool disable_chap, bool require_chap, + bool mutual_chap, int32_t chap_group); +void spdk_iscsi_tgt_nodes_config_text(FILE *fp); +void spdk_iscsi_tgt_nodes_info_json(struct spdk_json_write_ctx *w); +void spdk_iscsi_tgt_nodes_config_json(struct spdk_json_write_ctx *w); +#endif /* SPDK_ISCSI_TGT_NODE_H_ */ diff --git a/src/spdk/lib/json/Makefile b/src/spdk/lib/json/Makefile new file mode 100644 index 00000000..8808df9e --- /dev/null +++ b/src/spdk/lib/json/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = json_parse.c json_util.c json_write.c +LIBNAME = json + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/json/json_parse.c b/src/spdk/lib/json/json_parse.c new file mode 100644 index 00000000..8639d5ff --- /dev/null +++ b/src/spdk/lib/json/json_parse.c @@ -0,0 +1,668 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +#define SPDK_JSON_MAX_NESTING_DEPTH 64 + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +json_decode_string_escape_unicode(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + uint8_t *str = *strp; + int v0, v1, v2, v3; + uint32_t val; + uint32_t surrogate_high = 0; + int rc; +decode: + /* \uXXXX */ + assert(buf_end > str); + + if (*str++ != '\\') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if (*str++ != 'u') { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v3 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v2 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v1 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + if ((v0 = hex_value(*str++)) < 0) { return SPDK_JSON_PARSE_INVALID; } + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + + val = v0 | (v1 << 4) | (v2 << 8) | (v3 << 12); + + if (surrogate_high) { + /* We already parsed the high surrogate, so this should be the low part. */ + if (!utf16_valid_surrogate_low(val)) { + return SPDK_JSON_PARSE_INVALID; + } + + /* Convert UTF-16 surrogate pair into codepoint and fall through to utf8_encode. */ + val = utf16_decode_surrogate_pair(surrogate_high, val); + } else if (utf16_valid_surrogate_high(val)) { + surrogate_high = val; + + /* + * We parsed a \uXXXX sequence that decoded to the first half of a + * UTF-16 surrogate pair, so it must be immediately followed by another + * \uXXXX escape. + * + * Loop around to get the low half of the surrogate pair. + */ + if (buf_end == str) { return SPDK_JSON_PARSE_INCOMPLETE; } + goto decode; + } else if (utf16_valid_surrogate_low(val)) { + /* + * We found the second half of surrogate pair without the first half; + * this is an invalid encoding. + */ + return SPDK_JSON_PARSE_INVALID; + } + + /* + * Convert Unicode escape (or surrogate pair) to UTF-8 in place. + * + * This is safe (will not write beyond the buffer) because the \uXXXX sequence is 6 bytes + * (or 12 bytes for surrogate pairs), and the longest possible UTF-8 encoding of a + * single codepoint is 4 bytes. 
+ */ + if (out) { + rc = utf8_encode_unsafe(out, val); + } else { + rc = utf8_codepoint_len(val); + } + if (rc < 0) { + return SPDK_JSON_PARSE_INVALID; + } + + *strp = str; /* update input pointer */ + return rc; /* return number of bytes decoded */ +} + +static int +json_decode_string_escape_twochar(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + static const uint8_t escapes[256] = { + ['b'] = '\b', + ['f'] = '\f', + ['n'] = '\n', + ['r'] = '\r', + ['t'] = '\t', + ['/'] = '/', + ['"'] = '"', + ['\\'] = '\\', + }; + uint8_t *str = *strp; + uint8_t c; + + assert(buf_end > str); + if (buf_end - str < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + assert(str[0] == '\\'); + + c = escapes[str[1]]; + if (c) { + if (out) { + *out = c; + } + *strp += 2; /* consumed two bytes */ + return 1; /* produced one byte */ + } + + return SPDK_JSON_PARSE_INVALID; +} + +/* + * Decode JSON string backslash escape. + * \param strp pointer to pointer to first character of escape (the backslash). + * *strp is also advanced to indicate how much input was consumed. + * + * \return Number of bytes appended to out + */ +static int +json_decode_string_escape(uint8_t **strp, uint8_t *buf_end, uint8_t *out) +{ + int rc; + + rc = json_decode_string_escape_twochar(strp, buf_end, out); + if (rc > 0) { + return rc; + } + + return json_decode_string_escape_unicode(strp, buf_end, out); +} + +/* + * Decode JSON string in place. + * + * \param str_start Pointer to the beginning of the string (the opening " character). + * + * \return Number of bytes in decoded string (beginning from start). + */ +static int +json_decode_string(uint8_t *str_start, uint8_t *buf_end, uint8_t **str_end, uint32_t flags) +{ + uint8_t *str = str_start; + uint8_t *out = str_start + 1; /* Decode string in place (skip the initial quote) */ + int rc; + + if (buf_end - str_start < 2) { + /* + * Shortest valid string (the empty string) is two bytes (""), + * so this can't possibly be valid + */ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (*str++ != '"') { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + while (str < buf_end) { + if (str[0] == '"') { + /* + * End of string. + * Update str_end to point at next input byte and return output length. + */ + *str_end = str + 1; + return out - str_start - 1; + } else if (str[0] == '\\') { + rc = json_decode_string_escape(&str, buf_end, + flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE ? out : NULL); + assert(rc != 0); + if (rc < 0) { + *str_end = str; + return rc; + } + out += rc; + } else if (str[0] <= 0x1f) { + /* control characters must be escaped */ + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } else { + rc = utf8_valid(str, buf_end); + if (rc == 0) { + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; + } else if (rc < 0) { + *str_end = str; + return SPDK_JSON_PARSE_INVALID; + } + + if (out && out != str && (flags & SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE)) { + memmove(out, str, rc); + } + out += rc; + str += rc; + } + } + + /* If execution gets here, we ran out of buffer. 
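+	 * Report SPDK_JSON_PARSE_INCOMPLETE so the caller knows more input is
+	 * needed before the string can be decoded.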
*/ + *str_end = str; + return SPDK_JSON_PARSE_INCOMPLETE; +} + +static int +json_valid_number(uint8_t *start, uint8_t *buf_end) +{ + uint8_t *p = start; + uint8_t c; + + if (p >= buf_end) { return -1; } + + c = *p++; + if (c >= '1' && c <= '9') { goto num_int_digits; } + if (c == '0') { goto num_frac_or_exp; } + if (c == '-') { goto num_int_first_digit; } + p--; + goto done_invalid; + +num_int_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '0') { goto num_frac_or_exp; } + if (c >= '1' && c <= '9') { goto num_int_digits; } + p--; + } + goto done_invalid; + +num_int_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_int_digits; } + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_or_exp: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c == '.') { goto num_frac_first_digit; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_frac_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + p--; + } + goto done_invalid; + +num_frac_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_frac_digits; } + if (c == 'e' || c == 'E') { goto num_exp_sign; } + p--; + } + goto done_valid; + +num_exp_sign: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + if (c == '-' || c == '+') { goto num_exp_first_digit; } + p--; + } + goto done_invalid; + +num_exp_first_digit: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_invalid; + +num_exp_digits: + if (spdk_likely(p != buf_end)) { + c = *p++; + if (c >= '0' && c <= '9') { goto num_exp_digits; } + p--; + } + goto done_valid; + +done_valid: + /* Valid end state */ + return p - start; + +done_invalid: + /* Invalid end state */ + if (p == buf_end) { + /* Hit the end of the buffer - the stream is incomplete. */ + return SPDK_JSON_PARSE_INCOMPLETE; + } + + /* Found an invalid character in an invalid end state */ + return SPDK_JSON_PARSE_INVALID; +} + +static int +json_valid_comment(const uint8_t *start, const uint8_t *buf_end) +{ + const uint8_t *p = start; + bool multiline; + + assert(buf_end > p); + if (buf_end - p < 2) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (p[0] != '/') { + return SPDK_JSON_PARSE_INVALID; + } + if (p[1] == '*') { + multiline = true; + } else if (p[1] == '/') { + multiline = false; + } else { + return SPDK_JSON_PARSE_INVALID; + } + p += 2; + + if (multiline) { + while (p != buf_end - 1) { + if (p[0] == '*' && p[1] == '/') { + /* Include the terminating star and slash in the comment */ + return p - start + 2; + } + p++; + } + } else { + while (p != buf_end) { + if (*p == '\r' || *p == '\n') { + /* Do not include the line terminator in the comment */ + return p - start; + } + p++; + } + } + + return SPDK_JSON_PARSE_INCOMPLETE; +} + +struct json_literal { + enum spdk_json_val_type type; + uint32_t len; + uint8_t str[8]; +}; + +/* + * JSON only defines 3 possible literals; they can be uniquely identified by bits + * 3 and 4 of the first character: + * 'f' = 0b11[00]110 + * 'n' = 0b11[01]110 + * 't' = 0b11[10]100 + * These two bits can be used as an index into the g_json_literals array. 
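+ * For example, ('f' >> 3) & 3 == 0, ('n' >> 3) & 3 == 1 and ('t' >> 3) & 3 == 2,
+ * which matches the order of the entries below.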
+ */ +static const struct json_literal g_json_literals[] = { + {SPDK_JSON_VAL_FALSE, 5, "false"}, + {SPDK_JSON_VAL_NULL, 4, "null"}, + {SPDK_JSON_VAL_TRUE, 4, "true"}, + {} +}; + +static int +match_literal(const uint8_t *start, const uint8_t *end, const uint8_t *literal, size_t len) +{ + assert(end >= start); + if ((size_t)(end - start) < len) { + return SPDK_JSON_PARSE_INCOMPLETE; + } + + if (memcmp(start, literal, len) != 0) { + return SPDK_JSON_PARSE_INVALID; + } + + return len; +} + +ssize_t +spdk_json_parse(void *json, size_t size, struct spdk_json_val *values, size_t num_values, + void **end, uint32_t flags) +{ + uint8_t *json_end = json + size; + enum spdk_json_val_type containers[SPDK_JSON_MAX_NESTING_DEPTH]; + size_t con_value[SPDK_JSON_MAX_NESTING_DEPTH]; + enum spdk_json_val_type con_type = SPDK_JSON_VAL_INVALID; + bool trailing_comma = false; + size_t depth = 0; /* index into containers */ + size_t cur_value = 0; /* index into values */ + size_t con_start_value; + uint8_t *data = json; + uint8_t *new_data; + int rc = 0; + const struct json_literal *lit; + enum { + STATE_VALUE, /* initial state */ + STATE_VALUE_SEPARATOR, /* value separator (comma) */ + STATE_NAME, /* "name": value */ + STATE_NAME_SEPARATOR, /* colon */ + STATE_END, /* parsed the complete value, so only whitespace is valid */ + } state = STATE_VALUE; + +#define ADD_VALUE(t, val_start_ptr, val_end_ptr) \ + if (values && cur_value < num_values) { \ + values[cur_value].type = t; \ + values[cur_value].start = val_start_ptr; \ + values[cur_value].len = val_end_ptr - val_start_ptr; \ + } \ + cur_value++ + + while (data < json_end) { + uint8_t c = *data; + + switch (c) { + case ' ': + case '\t': + case '\r': + case '\n': + /* Whitespace is allowed between any tokens. */ + data++; + break; + + case 't': + case 'f': + case 'n': + /* true, false, or null */ + if (state != STATE_VALUE) { goto done_invalid; } + lit = &g_json_literals[(c >> 3) & 3]; /* See comment above g_json_literals[] */ + assert(lit->str[0] == c); + rc = match_literal(data, json_end, lit->str, lit->len); + if (rc < 0) { goto done_rc; } + ADD_VALUE(lit->type, data, data + rc); + data += rc; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '"': + if (state != STATE_VALUE && state != STATE_NAME) { goto done_invalid; } + rc = json_decode_string(data, json_end, &new_data, flags); + if (rc < 0) { + data = new_data; + goto done_rc; + } + /* + * Start is data + 1 to skip initial quote. + * Length is data + rc - 1 to skip both quotes. + */ + ADD_VALUE(state == STATE_VALUE ? SPDK_JSON_VAL_STRING : SPDK_JSON_VAL_NAME, + data + 1, data + rc - 1); + data = new_data; + if (state == STATE_NAME) { + state = STATE_NAME_SEPARATOR; + } else { + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + } + trailing_comma = false; + break; + + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (state != STATE_VALUE) { goto done_invalid; } + rc = json_valid_number(data, json_end); + if (rc < 0) { goto done_rc; } + ADD_VALUE(SPDK_JSON_VAL_NUMBER, data, data + rc); + data += rc; + state = depth ? 
STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case '{': + case '[': + if (state != STATE_VALUE) { goto done_invalid; } + if (depth == SPDK_JSON_MAX_NESTING_DEPTH) { + rc = SPDK_JSON_PARSE_MAX_DEPTH_EXCEEDED; + goto done_rc; + } + if (c == '{') { + con_type = SPDK_JSON_VAL_OBJECT_BEGIN; + state = STATE_NAME; + } else { + con_type = SPDK_JSON_VAL_ARRAY_BEGIN; + state = STATE_VALUE; + } + con_value[depth] = cur_value; + containers[depth++] = con_type; + ADD_VALUE(con_type, data, data + 1); + data++; + trailing_comma = false; + break; + + case '}': + case ']': + if (trailing_comma) { goto done_invalid; } + if (depth == 0) { goto done_invalid; } + con_type = containers[--depth]; + con_start_value = con_value[depth]; + if (values && con_start_value < num_values) { + values[con_start_value].len = cur_value - con_start_value - 1; + } + if (c == '}') { + if (state != STATE_NAME && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_OBJECT_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_OBJECT_END, data, data + 1); + } else { + if (state != STATE_VALUE && state != STATE_VALUE_SEPARATOR) { + goto done_invalid; + } + if (con_type != SPDK_JSON_VAL_ARRAY_BEGIN) { + goto done_invalid; + } + ADD_VALUE(SPDK_JSON_VAL_ARRAY_END, data, data + 1); + } + con_type = depth == 0 ? SPDK_JSON_VAL_INVALID : containers[depth - 1]; + data++; + state = depth ? STATE_VALUE_SEPARATOR : STATE_END; + trailing_comma = false; + break; + + case ',': + if (state != STATE_VALUE_SEPARATOR) { goto done_invalid; } + data++; + assert(con_type == SPDK_JSON_VAL_ARRAY_BEGIN || + con_type == SPDK_JSON_VAL_OBJECT_BEGIN); + state = con_type == SPDK_JSON_VAL_ARRAY_BEGIN ? STATE_VALUE : STATE_NAME; + trailing_comma = true; + break; + + case ':': + if (state != STATE_NAME_SEPARATOR) { goto done_invalid; } + data++; + state = STATE_VALUE; + break; + + case '/': + if (!(flags & SPDK_JSON_PARSE_FLAG_ALLOW_COMMENTS)) { + goto done_invalid; + } + rc = json_valid_comment(data, json_end); + if (rc < 0) { goto done_rc; } + /* Skip over comment */ + data += rc; + break; + + default: + goto done_invalid; + } + + if (state == STATE_END) { + break; + } + } + + if (state == STATE_END) { + /* Skip trailing whitespace */ + while (data < json_end) { + uint8_t c = *data; + + if (c == ' ' || c == '\t' || c == '\r' || c == '\n') { + data++; + } else { + break; + } + } + + /* + * These asserts are just for sanity checking - they are guaranteed by the allowed + * state transitions. + */ + assert(depth == 0); + assert(trailing_comma == false); + assert(data <= json_end); + if (end) { + *end = data; + } + return cur_value; + } + + /* Invalid end state - ran out of data */ + rc = SPDK_JSON_PARSE_INCOMPLETE; + +done_rc: + assert(rc < 0); + if (end) { + *end = data; + } + return rc; + +done_invalid: + rc = SPDK_JSON_PARSE_INVALID; + goto done_rc; +} diff --git a/src/spdk/lib/json/json_util.c b/src/spdk/lib/json/json_util.c new file mode 100644 index 00000000..1146e6fa --- /dev/null +++ b/src/spdk/lib/json/json_util.c @@ -0,0 +1,650 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" +#include "spdk_internal/log.h" + +#define SPDK_JSON_DEBUG(...) SPDK_DEBUGLOG(SPDK_LOG_JSON_UTIL, __VA_ARGS__) + +size_t +spdk_json_val_len(const struct spdk_json_val *val) +{ + if (val == NULL) { + return 0; + } + + if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN || val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + return val->len + 2; + } + + return 1; +} + +bool +spdk_json_strequal(const struct spdk_json_val *val, const char *str) +{ + size_t len; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return false; + } + + len = strlen(str); + if (val->len != len) { + return false; + } + + return memcmp(val->start, str, len) == 0; +} + +char * +spdk_json_strdup(const struct spdk_json_val *val) +{ + size_t len; + char *s; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NAME) { + return NULL; + } + + len = val->len; + + if (memchr(val->start, '\0', len)) { + /* String contains embedded NUL, so it is not a valid C string. 
*/ + return NULL; + } + + s = malloc(len + 1); + if (s == NULL) { + return s; + } + + memcpy(s, val->start, len); + s[len] = '\0'; + + return s; +} + +struct spdk_json_num { + bool negative; + uint64_t significand; + int64_t exponent; +}; + +static int +spdk_json_number_split(const struct spdk_json_val *val, struct spdk_json_num *num) +{ + const char *iter; + size_t remaining; + uint64_t *pval; + uint64_t frac_digits = 0; + uint64_t exponent_u64 = 0; + bool exponent_negative = false; + enum { + NUM_STATE_INT, + NUM_STATE_FRAC, + NUM_STATE_EXP, + } state; + + memset(num, 0, sizeof(*num)); + + if (val->type != SPDK_JSON_VAL_NUMBER) { + return -EINVAL; + } + + remaining = val->len; + if (remaining == 0) { + return -EINVAL; + } + + iter = val->start; + if (*iter == '-') { + num->negative = true; + iter++; + remaining--; + } + + state = NUM_STATE_INT; + pval = &num->significand; + while (remaining--) { + char c = *iter++; + + if (c == '.') { + state = NUM_STATE_FRAC; + } else if (c == 'e' || c == 'E') { + state = NUM_STATE_EXP; + pval = &exponent_u64; + } else if (c == '-') { + assert(state == NUM_STATE_EXP); + exponent_negative = true; + } else if (c == '+') { + assert(state == NUM_STATE_EXP); + /* exp_negative = false; */ /* already false by default */ + } else { + uint64_t new_val; + + assert(c >= '0' && c <= '9'); + new_val = *pval * 10 + c - '0'; + if (new_val < *pval) { + return -ERANGE; + } + + if (state == NUM_STATE_FRAC) { + frac_digits++; + } + + *pval = new_val; + } + } + + if (exponent_negative) { + if (exponent_u64 > 9223372036854775808ULL) { /* abs(INT64_MIN) */ + return -ERANGE; + } + num->exponent = (int64_t) - exponent_u64; + } else { + if (exponent_u64 > INT64_MAX) { + return -ERANGE; + } + num->exponent = exponent_u64; + } + num->exponent -= frac_digits; + + /* Apply as much of the exponent as possible without overflow or truncation */ + if (num->exponent < 0) { + while (num->exponent && num->significand >= 10 && num->significand % 10 == 0) { + num->significand /= 10; + num->exponent++; + } + } else { /* positive exponent */ + while (num->exponent) { + uint64_t new_val = num->significand * 10; + + if (new_val < num->significand) { + break; + } + + num->significand = new_val; + num->exponent--; + } + } + + return 0; +} + +int +spdk_json_number_to_uint16(const struct spdk_json_val *val, uint16_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = spdk_json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if (split_num.significand > UINT16_MAX) { + return -ERANGE; + } + *num = (uint16_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_int32(const struct spdk_json_val *val, int32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = spdk_json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent) { + return -ERANGE; + } + + if (split_num.negative) { + if (split_num.significand > 2147483648) { /* abs(INT32_MIN) */ + return -ERANGE; + } + *num = (int32_t) - (int64_t)split_num.significand; + return 0; + } + + /* positive */ + if (split_num.significand > INT32_MAX) { + return -ERANGE; + } + *num = (int32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint32(const struct spdk_json_val *val, uint32_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = spdk_json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + if 
(split_num.significand > UINT32_MAX) { + return -ERANGE; + } + *num = (uint32_t)split_num.significand; + return 0; +} + +int +spdk_json_number_to_uint64(const struct spdk_json_val *val, uint64_t *num) +{ + struct spdk_json_num split_num; + int rc; + + rc = spdk_json_number_split(val, &split_num); + if (rc) { + return rc; + } + + if (split_num.exponent || split_num.negative) { + return -ERANGE; + } + + *num = split_num.significand; + return 0; +} + +int +spdk_json_decode_object(const struct spdk_json_val *values, + const struct spdk_json_object_decoder *decoders, size_t num_decoders, void *out) +{ + uint32_t i; + bool invalid = false; + size_t decidx; + bool *seen; + + if (values == NULL || values->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + return -1; + } + + seen = calloc(sizeof(bool), num_decoders); + if (seen == NULL) { + return -1; + } + + for (i = 0; i < values->len;) { + const struct spdk_json_val *name = &values[i + 1]; + const struct spdk_json_val *v = &values[i + 2]; + bool found = false; + + for (decidx = 0; decidx < num_decoders; decidx++) { + const struct spdk_json_object_decoder *dec = &decoders[decidx]; + if (spdk_json_strequal(name, dec->name)) { + void *field = (void *)((uintptr_t)out + dec->offset); + + found = true; + + if (seen[decidx]) { + /* duplicate field name */ + invalid = true; + } else { + seen[decidx] = true; + if (dec->decode_func(v, field)) { + invalid = true; + /* keep going to fill out any other valid keys */ + } + } + break; + } + } + + if (!found) { + invalid = true; + } + + i += 1 + spdk_json_val_len(v); + } + + for (decidx = 0; decidx < num_decoders; decidx++) { + if (!decoders[decidx].optional && !seen[decidx]) { + /* required field is missing */ + invalid = true; + break; + } + } + + free(seen); + return invalid ? -1 : 0; +} + +int +spdk_json_decode_array(const struct spdk_json_val *values, spdk_json_decode_fn decode_func, + void *out, size_t max_size, size_t *out_size, size_t stride) +{ + uint32_t i; + char *field; + char *out_end; + + if (values == NULL || values->type != SPDK_JSON_VAL_ARRAY_BEGIN) { + return -1; + } + + *out_size = 0; + field = out; + out_end = field + max_size * stride; + for (i = 0; i < values->len;) { + const struct spdk_json_val *v = &values[i + 1]; + + if (field == out_end) { + return -1; + } + + if (decode_func(v, field)) { + return -1; + } + + i += spdk_json_val_len(v); + field += stride; + (*out_size)++; + } + + return 0; +} + +int +spdk_json_decode_bool(const struct spdk_json_val *val, void *out) +{ + bool *f = out; + + if (val->type != SPDK_JSON_VAL_TRUE && val->type != SPDK_JSON_VAL_FALSE) { + return -1; + } + + *f = val->type == SPDK_JSON_VAL_TRUE; + return 0; +} + +int +spdk_json_decode_uint16(const struct spdk_json_val *val, void *out) +{ + uint16_t *i = out; + + return spdk_json_number_to_uint16(val, i); +} + +int +spdk_json_decode_int32(const struct spdk_json_val *val, void *out) +{ + int32_t *i = out; + + return spdk_json_number_to_int32(val, i); +} + +int +spdk_json_decode_uint32(const struct spdk_json_val *val, void *out) +{ + uint32_t *i = out; + + return spdk_json_number_to_uint32(val, i); +} + +int +spdk_json_decode_uint64(const struct spdk_json_val *val, void *out) +{ + uint64_t *i = out; + + return spdk_json_number_to_uint64(val, i); +} + +int +spdk_json_decode_string(const struct spdk_json_val *val, void *out) +{ + char **s = out; + + free(*s); + + *s = spdk_json_strdup(val); + + if (*s) { + return 0; + } else { + return -1; + } +} + +static struct spdk_json_val * +spdk_json_first(struct spdk_json_val *object, 
enum spdk_json_val_type type) +{ + /* 'object' must be JSON object or array. 'type' might be combination of these two. */ + assert((type & (SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN)) != 0); + + assert(object != NULL); + + if ((object->type & type) == 0) { + return NULL; + } + + object++; + if (object->len == 0) { + return NULL; + } + + return object; +} + +static struct spdk_json_val * +spdk_json_value(struct spdk_json_val *key) +{ + return key->type == SPDK_JSON_VAL_NAME ? key + 1 : NULL; +} + +int +spdk_json_find(struct spdk_json_val *object, const char *key_name, struct spdk_json_val **key, + struct spdk_json_val **val, enum spdk_json_val_type type) +{ + struct spdk_json_val *_key = NULL; + struct spdk_json_val *_val = NULL; + struct spdk_json_val *it; + + assert(object != NULL); + + for (it = spdk_json_first(object, SPDK_JSON_VAL_ARRAY_BEGIN | SPDK_JSON_VAL_OBJECT_BEGIN); + it != NULL; + it = spdk_json_next(it)) { + if (it->type != SPDK_JSON_VAL_NAME) { + continue; + } + + if (spdk_json_strequal(it, key_name) != true) { + continue; + } + + if (_key) { + SPDK_JSON_DEBUG("Duplicate key '%s'", key_name); + return -EINVAL; + } + + _key = it; + _val = spdk_json_value(_key); + + if (type != SPDK_JSON_VAL_INVALID && (_val->type & type) == 0) { + SPDK_JSON_DEBUG("key '%s' type is %#x but expected one of %#x\n", key_name, _val->type, type); + return -EDOM; + } + } + + if (key) { + *key = _key; + } + + if (val) { + *val = _val; + } + + return _val ? 0 : -ENOENT; +} + +int +spdk_json_find_string(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_STRING); +} + +int +spdk_json_find_array(struct spdk_json_val *object, const char *key_name, + struct spdk_json_val **key, struct spdk_json_val **val) +{ + return spdk_json_find(object, key_name, key, val, SPDK_JSON_VAL_ARRAY_BEGIN); +} + +struct spdk_json_val * +spdk_json_object_first(struct spdk_json_val *object) +{ + struct spdk_json_val *first = spdk_json_first(object, SPDK_JSON_VAL_OBJECT_BEGIN); + + /* Empty object? */ + return first && first->type != SPDK_JSON_VAL_OBJECT_END ? first : NULL; +} + +struct spdk_json_val * +spdk_json_array_first(struct spdk_json_val *array_begin) +{ + struct spdk_json_val *first = spdk_json_first(array_begin, SPDK_JSON_VAL_ARRAY_BEGIN); + + /* Empty array? */ + return first && first->type != SPDK_JSON_VAL_ARRAY_END ? 
first : NULL; +} + +static struct spdk_json_val * +spdk_json_skip_object_or_array(struct spdk_json_val *val) +{ + unsigned lvl; + enum spdk_json_val_type end_type; + struct spdk_json_val *it; + + if (val->type == SPDK_JSON_VAL_OBJECT_BEGIN) { + end_type = SPDK_JSON_VAL_OBJECT_END; + } else if (val->type == SPDK_JSON_VAL_ARRAY_BEGIN) { + end_type = SPDK_JSON_VAL_ARRAY_END; + } else { + SPDK_JSON_DEBUG("Expected JSON object (%#x) or array (%#x) but got %#x\n", + SPDK_JSON_VAL_OBJECT_BEGIN, SPDK_JSON_VAL_ARRAY_END, val->type); + return NULL; + } + + lvl = 1; + for (it = val + 1; it->type != SPDK_JSON_VAL_INVALID && lvl != 0; it++) { + if (it->type == val->type) { + lvl++; + } else if (it->type == end_type) { + lvl--; + } + } + + /* if lvl != 0 we have invalid JSON object */ + if (lvl != 0) { + SPDK_JSON_DEBUG("Can't find end of object (type: %#x): lvl (%u) != 0)\n", val->type, lvl); + it = NULL; + } + + return it; +} + +struct spdk_json_val * +spdk_json_next(struct spdk_json_val *it) +{ + struct spdk_json_val *val, *next; + + switch (it->type) { + case SPDK_JSON_VAL_NAME: + val = spdk_json_value(it); + next = spdk_json_next(val); + break; + + /* We are in the middle of an array - get to next entry */ + case SPDK_JSON_VAL_NULL: + case SPDK_JSON_VAL_TRUE: + case SPDK_JSON_VAL_FALSE: + case SPDK_JSON_VAL_NUMBER: + case SPDK_JSON_VAL_STRING: + val = it + 1; + return val; + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + next = spdk_json_skip_object_or_array(it); + break; + + /* Can't go to the next object if started from the end of array or object */ + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + assert(false); + return NULL; + + } + + /* EOF ? */ + if (next == NULL) { + return NULL; + } + + switch (next->type) { + case SPDK_JSON_VAL_ARRAY_END: + case SPDK_JSON_VAL_OBJECT_END: + case SPDK_JSON_VAL_INVALID: + return NULL; + default: + /* Next value */ + return next; + } +} + +SPDK_LOG_REGISTER_COMPONENT("json_util", SPDK_LOG_JSON_UTIL) diff --git a/src/spdk/lib/json/json_write.c b/src/spdk/lib/json/json_write.c new file mode 100644 index 00000000..0cd600be --- /dev/null +++ b/src/spdk/lib/json/json_write.c @@ -0,0 +1,687 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/json.h" + +#include "spdk_internal/utf.h" + +struct spdk_json_write_ctx { + spdk_json_write_cb write_cb; + void *cb_ctx; + uint32_t flags; + uint32_t indent; + bool new_indent; + bool first_value; + bool failed; + size_t buf_filled; + uint8_t buf[4096]; +}; + +static int emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size); + +static int +fail(struct spdk_json_write_ctx *w) +{ + w->failed = true; + return -1; +} + +static int +flush_buf(struct spdk_json_write_ctx *w) +{ + int rc; + + rc = w->write_cb(w->cb_ctx, w->buf, w->buf_filled); + if (rc != 0) { + return fail(w); + } + + w->buf_filled = 0; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_json_write_begin(spdk_json_write_cb write_cb, void *cb_ctx, uint32_t flags) +{ + struct spdk_json_write_ctx *w; + + w = calloc(1, sizeof(*w)); + if (w == NULL) { + return w; + } + + w->write_cb = write_cb; + w->cb_ctx = cb_ctx; + w->flags = flags; + w->indent = 0; + w->new_indent = false; + w->first_value = true; + w->failed = false; + w->buf_filled = 0; + + return w; +} + +int +spdk_json_write_end(struct spdk_json_write_ctx *w) +{ + bool failed; + int rc; + + if (w == NULL) { + return 0; + } + + failed = w->failed; + + rc = flush_buf(w); + if (rc != 0) { + failed = true; + } + + free(w); + + return failed ? -1 : 0; +} + +static inline int +emit(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + + if (spdk_unlikely(size > buf_remain)) { + /* Not enough space in buffer for the new data. */ + return emit_buf_full(w, data, size); + } + + /* Copy the new data into buf. */ + memcpy(w->buf + w->buf_filled, data, size); + w->buf_filled += size; + return 0; +} + +static int +emit_buf_full(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + size_t buf_remain = sizeof(w->buf) - w->buf_filled; + int rc; + + assert(size > buf_remain); + + /* Copy as much of the new data as possible into the buffer and flush it. */ + memcpy(w->buf + w->buf_filled, data, buf_remain); + w->buf_filled += buf_remain; + + rc = flush_buf(w); + if (rc != 0) { + return fail(w); + } + + /* Recurse to emit the rest of the data. 
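+	 * The recursion terminates because flush_buf() empties the buffer before
+	 * each nested emit() call, so every subsequent call can consume up to a
+	 * full sizeof(w->buf) bytes of the remaining data.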
*/ + return emit(w, data + buf_remain, size - buf_remain); +} + +static int +emit_fmt(struct spdk_json_write_ctx *w, const void *data, size_t size) +{ + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + return emit(w, data, size); + } + return 0; +} + +static int +emit_indent(struct spdk_json_write_ctx *w) +{ + uint32_t i; + + if (w->flags & SPDK_JSON_WRITE_FLAG_FORMATTED) { + for (i = 0; i < w->indent; i++) { + if (emit(w, " ", 2)) { return fail(w); } + } + } + return 0; +} + +static int +begin_value(struct spdk_json_write_ctx *w) +{ + // TODO: check for value state + if (w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + if (!w->first_value) { + if (emit(w, ",", 1)) { return fail(w); } + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->first_value = false; + w->new_indent = false; + return 0; +} + +int +spdk_json_write_val_raw(struct spdk_json_write_ctx *w, const void *data, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, data, len); +} + +int +spdk_json_write_null(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + return emit(w, "null", 4); +} + +int +spdk_json_write_bool(struct spdk_json_write_ctx *w, bool val) +{ + if (begin_value(w)) { return fail(w); } + if (val) { + return emit(w, "true", 4); + } else { + return emit(w, "false", 5); + } +} + +int +spdk_json_write_int32(struct spdk_json_write_ctx *w, int32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint32(struct spdk_json_write_ctx *w, uint32_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu32, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_int64(struct spdk_json_write_ctx *w, int64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRId64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +int +spdk_json_write_uint64(struct spdk_json_write_ctx *w, uint64_t val) +{ + char buf[32]; + int count; + + if (begin_value(w)) { return fail(w); } + count = snprintf(buf, sizeof(buf), "%" PRIu64, val); + if (count <= 0 || (size_t)count >= sizeof(buf)) { return fail(w); } + return emit(w, buf, count); +} + +static void +write_hex_4(void *dest, uint16_t val) +{ + uint8_t *p = dest; + char hex[] = "0123456789ABCDEF"; + + p[0] = hex[(val >> 12)]; + p[1] = hex[(val >> 8) & 0xF]; + p[2] = hex[(val >> 4) & 0xF]; + p[3] = hex[val & 0xF]; +} + +static inline int +write_codepoint(struct spdk_json_write_ctx *w, uint32_t codepoint) +{ + static const uint8_t escapes[] = { + ['\b'] = 'b', + ['\f'] = 'f', + ['\n'] = 'n', + ['\r'] = 'r', + ['\t'] = 't', + ['"'] = '"', + ['\\'] = '\\', + /* + * Forward slash (/) is intentionally not converted to an escape + * (it is valid unescaped). 
+ */ + }; + uint16_t high, low; + char out[13]; + size_t out_len; + + if (codepoint < sizeof(escapes) && escapes[codepoint]) { + out[0] = '\\'; + out[1] = escapes[codepoint]; + out_len = 2; + } else if (codepoint >= 0x20 && codepoint < 0x7F) { + /* + * Encode plain ASCII directly (except 0x7F, since it is really + * a control character, despite the JSON spec not considering it one). + */ + out[0] = (uint8_t)codepoint; + out_len = 1; + } else if (codepoint < 0x10000) { + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], (uint16_t)codepoint); + out_len = 6; + } else { + utf16_encode_surrogate_pair(codepoint, &high, &low); + out[0] = '\\'; + out[1] = 'u'; + write_hex_4(&out[2], high); + out[6] = '\\'; + out[7] = 'u'; + write_hex_4(&out[8], low); + out_len = 12; + } + + return emit(w, out, out_len); +} + +static int +write_string_or_name(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + const uint8_t *p = val; + const uint8_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf8_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = utf8_decode_unsafe_1(p); + break; + case 2: + codepoint = utf8_decode_unsafe_2(p); + break; + case 3: + codepoint = utf8_decode_unsafe_3(p); + break; + case 4: + codepoint = utf8_decode_unsafe_4(p); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +static int +write_string_or_name_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + const uint16_t *p = val; + const uint16_t *end = val + len; + + if (emit(w, "\"", 1)) { return fail(w); } + + while (p != end) { + int codepoint_len; + uint32_t codepoint; + + codepoint_len = utf16le_valid(p, end); + switch (codepoint_len) { + case 1: + codepoint = from_le16(&p[0]); + break; + case 2: + codepoint = utf16_decode_surrogate_pair(from_le16(&p[0]), from_le16(&p[1])); + break; + default: + return fail(w); + } + + if (write_codepoint(w, codepoint)) { return fail(w); } + p += codepoint_len; + } + + return emit(w, "\"", 1); +} + +int +spdk_json_write_string_raw(struct spdk_json_write_ctx *w, const char *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name(w, val, len); +} + +int +spdk_json_write_string(struct spdk_json_write_ctx *w, const char *val) +{ + return spdk_json_write_string_raw(w, val, strlen(val)); +} + +int +spdk_json_write_string_utf16le_raw(struct spdk_json_write_ctx *w, const uint16_t *val, size_t len) +{ + if (begin_value(w)) { return fail(w); } + return write_string_or_name_utf16le(w, val, len); +} + +int +spdk_json_write_string_utf16le(struct spdk_json_write_ctx *w, const uint16_t *val) +{ + const uint16_t *p; + size_t len; + + for (len = 0, p = val; *p; p++) { + len++; + } + + return spdk_json_write_string_utf16le_raw(w, val, len); +} + +int +spdk_json_write_string_fmt(struct spdk_json_write_ctx *w, const char *fmt, ...) 
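+/*
+ * A minimal usage sketch of this writer, assuming a write callback with the
+ * (cb_ctx, data, size) shape used by flush_buf() above; the key names and
+ * values are placeholders:
+ *
+ *   static int
+ *   write_cb(void *cb_ctx, const void *data, size_t size)
+ *   {
+ *           return fwrite(data, 1, size, (FILE *)cb_ctx) == size ? 0 : -1;
+ *   }
+ *
+ *   struct spdk_json_write_ctx *w = spdk_json_write_begin(write_cb, stdout, 0);
+ *   spdk_json_write_object_begin(w);
+ *   spdk_json_write_named_string(w, "name", "example");
+ *   spdk_json_write_named_uint32(w, "value", 42);
+ *   spdk_json_write_object_end(w);
+ *   spdk_json_write_end(w);
+ */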
+{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_string_fmt_v(w, fmt, args); + va_end(args); + + return rc; +} + +int +spdk_json_write_string_fmt_v(struct spdk_json_write_ctx *w, const char *fmt, va_list args) +{ + char *s; + int rc; + + s = spdk_vsprintf_alloc(fmt, args); + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int +spdk_json_write_array_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "[", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_array_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + if (w->indent == 0) { return fail(w); } + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "]", 1); +} + +int +spdk_json_write_object_begin(struct spdk_json_write_ctx *w) +{ + if (begin_value(w)) { return fail(w); } + w->first_value = true; + w->new_indent = true; + w->indent++; + if (emit(w, "{", 1)) { return fail(w); } + return 0; +} + +int +spdk_json_write_object_end(struct spdk_json_write_ctx *w) +{ + w->first_value = false; + w->indent--; + if (!w->new_indent) { + if (emit_fmt(w, "\n", 1)) { return fail(w); } + if (emit_indent(w)) { return fail(w); } + } + w->new_indent = false; + return emit(w, "}", 1); +} + +int +spdk_json_write_name_raw(struct spdk_json_write_ctx *w, const char *name, size_t len) +{ + /* TODO: check that container is an object */ + if (begin_value(w)) { return fail(w); } + if (write_string_or_name(w, name, len)) { return fail(w); } + w->first_value = true; + if (emit(w, ":", 1)) { return fail(w); } + return emit_fmt(w, " ", 1); +} + +int +spdk_json_write_name(struct spdk_json_write_ctx *w, const char *name) +{ + return spdk_json_write_name_raw(w, name, strlen(name)); +} + +int +spdk_json_write_val(struct spdk_json_write_ctx *w, const struct spdk_json_val *val) +{ + size_t num_values, i; + + switch (val->type) { + case SPDK_JSON_VAL_NUMBER: + return spdk_json_write_val_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_STRING: + return spdk_json_write_string_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_NAME: + return spdk_json_write_name_raw(w, val->start, val->len); + + case SPDK_JSON_VAL_TRUE: + return spdk_json_write_bool(w, true); + + case SPDK_JSON_VAL_FALSE: + return spdk_json_write_bool(w, false); + + case SPDK_JSON_VAL_NULL: + return spdk_json_write_null(w); + + case SPDK_JSON_VAL_ARRAY_BEGIN: + case SPDK_JSON_VAL_OBJECT_BEGIN: + num_values = val[0].len; + + if (val[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + if (spdk_json_write_object_begin(w)) { + return fail(w); + } + } else { + if (spdk_json_write_array_begin(w)) { + return fail(w); + } + } + + // Loop up to and including the _END value + for (i = 0; i < num_values + 1;) { + if (spdk_json_write_val(w, &val[i + 1])) { + return fail(w); + } + if (val[i + 1].type == SPDK_JSON_VAL_ARRAY_BEGIN || + val[i + 1].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + i += val[i + 1].len + 2; + } else { + i++; + } + } + return 0; + + case SPDK_JSON_VAL_ARRAY_END: + return spdk_json_write_array_end(w); + + case SPDK_JSON_VAL_OBJECT_END: + return spdk_json_write_object_end(w); + + case SPDK_JSON_VAL_INVALID: + // Handle INVALID to make the compiler happy (and catch other unhandled types) + return fail(w); + } + + return fail(w); +} + +int spdk_json_write_named_null(struct 
spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + return rc ? rc : spdk_json_write_null(w); +} + +int spdk_json_write_named_bool(struct spdk_json_write_ctx *w, const char *name, bool val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_bool(w, val); +} + +int spdk_json_write_named_int32(struct spdk_json_write_ctx *w, const char *name, int32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int32(w, val); +} + +int spdk_json_write_named_uint32(struct spdk_json_write_ctx *w, const char *name, uint32_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint32(w, val); +} + +int spdk_json_write_named_uint64(struct spdk_json_write_ctx *w, const char *name, uint64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_uint64(w, val); +} + +int spdk_json_write_named_int64(struct spdk_json_write_ctx *w, const char *name, int64_t val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_int64(w, val); +} + +int spdk_json_write_named_string(struct spdk_json_write_ctx *w, const char *name, const char *val) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_string(w, val); +} + +int spdk_json_write_named_string_fmt(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, ...) +{ + va_list args; + int rc; + + va_start(args, fmt); + rc = spdk_json_write_named_string_fmt_v(w, name, fmt, args); + va_end(args); + + return rc; +} + +int spdk_json_write_named_string_fmt_v(struct spdk_json_write_ctx *w, const char *name, + const char *fmt, va_list args) +{ + char *s; + int rc; + + rc = spdk_json_write_name(w, name); + if (rc) { + return rc; + } + + s = spdk_vsprintf_alloc(fmt, args); + + if (s == NULL) { + return -1; + } + + rc = spdk_json_write_string(w, s); + free(s); + return rc; +} + +int spdk_json_write_named_array_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_array_begin(w); +} + +int spdk_json_write_named_object_begin(struct spdk_json_write_ctx *w, const char *name) +{ + int rc = spdk_json_write_name(w, name); + + return rc ? rc : spdk_json_write_object_begin(w); +} diff --git a/src/spdk/lib/jsonrpc/Makefile b/src/spdk/lib/jsonrpc/Makefile new file mode 100644 index 00000000..dd323f1e --- /dev/null +++ b/src/spdk/lib/jsonrpc/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = jsonrpc +C_SRCS = jsonrpc_server.c jsonrpc_server_tcp.c +C_SRCS += jsonrpc_client.c jsonrpc_client_tcp.c + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client.c b/src/spdk/lib/jsonrpc/jsonrpc_client.c new file mode 100644 index 00000000..2426f4e3 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client.c @@ -0,0 +1,213 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/util.h" +#include "jsonrpc_internal.h" + +struct jsonrpc_response { + const struct spdk_json_val *version; + const struct spdk_json_val *id; + const struct spdk_json_val *result; +}; + +static int +capture_string(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (spdk_json_strequal(val, "2.0") != true) { + return SPDK_JSON_PARSE_INVALID; + } + + *vptr = val; + return 0; +} + +static int +capture_id(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + if (val->type != SPDK_JSON_VAL_STRING && val->type != SPDK_JSON_VAL_NUMBER) { + return SPDK_JSON_PARSE_INVALID; + } + + *vptr = val; + return 0; +} + +static int +capture_any(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_response_decoders[] = { + {"jsonrpc", offsetof(struct jsonrpc_response, version), capture_string}, + {"id", offsetof(struct jsonrpc_response, id), capture_id}, + {"result", offsetof(struct jsonrpc_response, result), capture_any}, +}; + +static int +parse_single_response(struct spdk_json_val *values, + spdk_jsonrpc_client_response_parser parser_fn, + void *parser_ctx) +{ + struct jsonrpc_response resp = {}; + + if (spdk_json_decode_object(values, jsonrpc_response_decoders, + SPDK_COUNTOF(jsonrpc_response_decoders), + &resp)) { + return SPDK_JSON_PARSE_INVALID; + } + + return parser_fn(parser_ctx, resp.result); +} + +int +spdk_jsonrpc_parse_response(struct spdk_jsonrpc_client *client, void *json, size_t size) +{ + ssize_t rc; + void *end = NULL; + + /* Check to see if we have received a full JSON value. */ + rc = spdk_json_parse(json, size, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC_CLIENT, "Json string is :\n%s\n", (char *)json); + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_ERRLOG("JSON parse error\n"); + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return SPDK_JSON_PARSE_INVALID; + } + + /* Decode a second time now that there is a full JSON value available. 
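 * This second pass fills client->values (up to SPDK_JSONRPC_MAX_VALUES tokens) and, because of SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE, decodes string values directly inside the receive buffer.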
*/ + rc = spdk_json_parse(json, size, client->values, SPDK_JSONRPC_MAX_VALUES, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_ERRLOG("JSON parse error on second pass\n"); + return SPDK_JSON_PARSE_INVALID; + } + + assert(end != NULL); + + if (client->values[0].type != SPDK_JSON_VAL_OBJECT_BEGIN) { + SPDK_ERRLOG("top-level JSON value was not object\n"); + return SPDK_JSON_PARSE_INVALID; + } + + rc = parse_single_response(client->values, client->parser_fn, client->parser_ctx); + + return rc; +} + +static int +jsonrpc_client_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_client_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -ENOSPC; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -ENOMEM; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_request(struct spdk_jsonrpc_client_request *request, int32_t id, + const char *method) +{ + struct spdk_json_write_ctx *w; + + w = spdk_json_write_begin(jsonrpc_client_write_cb, request, 0); + if (w == NULL) { + return NULL; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "jsonrpc", "2.0"); + + if (id >= 0) { + spdk_json_write_named_int32(w, "id", id); + } + + if (method) { + spdk_json_write_named_string(w, "method", method); + } + + return w; +} + +void +spdk_jsonrpc_end_request(struct spdk_jsonrpc_client_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + + spdk_json_write_object_end(w); + spdk_json_write_end(w); + jsonrpc_client_write_cb(request, "\n", 1); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc_client", SPDK_LOG_RPC_CLIENT) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c new file mode 100644 index 00000000..a7696c84 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_client_tcp.c @@ -0,0 +1,284 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "spdk/string.h" +#include "jsonrpc_internal.h" + +#define RPC_DEFAULT_PORT "5260" + +static struct spdk_jsonrpc_client * +_spdk_jsonrpc_client_connect(int domain, int protocol, + struct sockaddr *server_addr, socklen_t addrlen) +{ + struct spdk_jsonrpc_client *client; + int rc; + + client = calloc(1, sizeof(struct spdk_jsonrpc_client)); + if (client == NULL) { + return NULL; + } + + client->sockfd = socket(domain, SOCK_STREAM, protocol); + if (client->sockfd < 0) { + SPDK_ERRLOG("socket() failed\n"); + free(client); + return NULL; + } + + rc = connect(client->sockfd, server_addr, addrlen); + if (rc != 0) { + SPDK_ERRLOG("could not connet JSON-RPC server: %s\n", spdk_strerror(errno)); + close(client->sockfd); + free(client); + return NULL; + } + + /* memory malloc for recv-buf */ + client->recv_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!client->recv_buf) { + SPDK_ERRLOG("memory malloc for recv-buf failed\n"); + close(client->sockfd); + free(client); + return NULL; + } + client->recv_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + + return client; +} + +struct spdk_jsonrpc_client * +spdk_jsonrpc_client_connect(const char *rpc_sock_addr, int addr_family) +{ + struct spdk_jsonrpc_client *client; + + if (addr_family == AF_UNIX) { + /* Unix Domain Socket */ + struct sockaddr_un rpc_sock_addr_unix = {}; + int rc; + + rpc_sock_addr_unix.sun_family = AF_UNIX; + rc = snprintf(rpc_sock_addr_unix.sun_path, + sizeof(rpc_sock_addr_unix.sun_path), + "%s", rpc_sock_addr); + if (rc < 0 || (size_t)rc >= sizeof(rpc_sock_addr_unix.sun_path)) { + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + return NULL; + } + + client = _spdk_jsonrpc_client_connect(AF_UNIX, 0, + (struct sockaddr *)&rpc_sock_addr_unix, + sizeof(rpc_sock_addr_unix)); + } else { + /* TCP/IP socket */ + struct addrinfo hints; + struct addrinfo *res; + char *tmp; + char *host, *port; + + tmp = strdup(rpc_sock_addr); + if (!tmp) { + SPDK_ERRLOG("Out of memory\n"); + return NULL; + } + + if (spdk_parse_ip_addr(tmp, &host, &port) < 0) { + free(tmp); + SPDK_ERRLOG("Invalid listen address '%s'\n", rpc_sock_addr); + return NULL; + } + + if (port == NULL) { + port = RPC_DEFAULT_PORT; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + if (getaddrinfo(host, port, &hints, &res) != 0) { + free(tmp); + SPDK_ERRLOG("Unable to look up RPC connnect address '%s'\n", rpc_sock_addr); + return NULL; + } + + client = _spdk_jsonrpc_client_connect(res->ai_family, res->ai_protocol, + res->ai_addr, res->ai_addrlen); + + freeaddrinfo(res); + free(tmp); + } + + return client; +} + +void +spdk_jsonrpc_client_close(struct spdk_jsonrpc_client *client) 
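/*
 * Typical client lifecycle, sketched from the functions defined in this file and in
 * jsonrpc_client.c (error handling omitted; the socket path, request id and method name
 * are only examples, and parse_cb/cb_ctx are caller-supplied):
 *
 *   struct spdk_jsonrpc_client *client = spdk_jsonrpc_client_connect("/var/tmp/spdk.sock", AF_UNIX);
 *   struct spdk_jsonrpc_client_request *req = spdk_jsonrpc_client_create_request();
 *   struct spdk_json_write_ctx *w = spdk_jsonrpc_begin_request(req, 1, "get_rpc_methods");
 *   spdk_jsonrpc_end_request(req, w);
 *   spdk_jsonrpc_client_send_request(client, req);
 *   spdk_jsonrpc_client_recv_response(client, parse_cb, cb_ctx);
 *   spdk_jsonrpc_client_free_request(req);
 *   spdk_jsonrpc_client_close(client);
 */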
+{ + if (client->sockfd >= 0) { + close(client->sockfd); + free(client->recv_buf); + client->sockfd = -1; + } + + free(client); +} + +struct spdk_jsonrpc_client_request * +spdk_jsonrpc_client_create_request(void) +{ + struct spdk_jsonrpc_client_request *request; + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + return NULL; + } + + /* memory malloc for send-buf */ + request->send_buf = malloc(SPDK_JSONRPC_SEND_BUF_SIZE_INIT); + if (!request->send_buf) { + SPDK_ERRLOG("memory malloc for send-buf failed\n"); + free(request); + return NULL; + } + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + + return request; +} + +void +spdk_jsonrpc_client_free_request(struct spdk_jsonrpc_client_request *req) +{ + free(req->send_buf); + free(req); +} + +int +spdk_jsonrpc_client_send_request(struct spdk_jsonrpc_client *client, + struct spdk_jsonrpc_client_request *request) +{ + ssize_t rc; + + /* Reset offset in request */ + request->send_offset = 0; + + while (request->send_len > 0) { + rc = send(client->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc <= 0) { + if (rc < 0 && errno == EINTR) { + rc = 0; + } else { + return rc; + } + } + + request->send_offset += rc; + request->send_len -= rc; + } + + return 0; +} + +static int +recv_buf_expand(struct spdk_jsonrpc_client *client) +{ + uint8_t *new_buf; + + if (client->recv_buf_size * 2 > SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + return -ENOSPC; + } + + new_buf = realloc(client->recv_buf, client->recv_buf_size * 2); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing recv_buf failed (current size %zu, new size %zu)\n", + client->recv_buf_size, client->recv_buf_size * 2); + return -ENOMEM; + } + + client->recv_buf = new_buf; + client->recv_buf_size *= 2; + + return 0; +} + +int +spdk_jsonrpc_client_recv_response(struct spdk_jsonrpc_client *client, + spdk_jsonrpc_client_response_parser parser_fn, + void *parser_ctx) +{ + ssize_t rc = 0; + size_t recv_avail; + size_t recv_offset = 0; + + client->parser_fn = parser_fn; + client->parser_ctx = parser_ctx; + + recv_avail = client->recv_buf_size; + + while (recv_avail > 0) { + rc = recv(client->sockfd, client->recv_buf + recv_offset, recv_avail, 0); + if (rc < 0) { + if (errno == EINTR) { + continue; + } else { + return errno; + } + } else if (rc == 0) { + return -EIO; + } + + recv_offset += rc; + recv_avail -= rc; + + /* Check to see if we have received a full JSON value. */ + rc = spdk_jsonrpc_parse_response(client, client->recv_buf, recv_offset); + if (rc == 0) { + /* Successfully parsed response */ + return 0; + } else if (rc != SPDK_JSON_PARSE_INCOMPLETE) { + SPDK_ERRLOG("jsonrpc parse request failed\n"); + return -EINVAL; + } + + /* Expand receive buffer if larger one is needed */ + if (recv_avail == 0) { + rc = recv_buf_expand(client); + if (rc != 0) { + return rc; + } + recv_avail = client->recv_buf_size - recv_offset; + } + } + + return 0; +} diff --git a/src/spdk/lib/jsonrpc/jsonrpc_internal.h b/src/spdk/lib/jsonrpc/jsonrpc_internal.h new file mode 100644 index 00000000..87355fdb --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_internal.h @@ -0,0 +1,149 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_JSONRPC_INTERNAL_H_ +#define SPDK_JSONRPC_INTERNAL_H_ + +#include "spdk/stdinc.h" + +#include "spdk/jsonrpc.h" + +#include "spdk_internal/log.h" + +#define SPDK_JSONRPC_RECV_BUF_SIZE (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_INIT (32 * 1024) +#define SPDK_JSONRPC_SEND_BUF_SIZE_MAX (32 * 1024 * 1024) +#define SPDK_JSONRPC_ID_MAX_LEN 128 +#define SPDK_JSONRPC_MAX_CONNS 64 +#define SPDK_JSONRPC_MAX_VALUES 1024 + +struct spdk_jsonrpc_request { + struct spdk_jsonrpc_server_conn *conn; + + /* Copy of request id value */ + struct spdk_json_val id; + uint8_t id_data[SPDK_JSONRPC_ID_MAX_LEN]; + + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *send_buf; + + STAILQ_ENTRY(spdk_jsonrpc_request) link; +}; + +struct spdk_jsonrpc_server_conn { + struct spdk_jsonrpc_server *server; + int sockfd; + bool closed; + struct spdk_json_val values[SPDK_JSONRPC_MAX_VALUES]; + size_t recv_len; + uint8_t recv_buf[SPDK_JSONRPC_RECV_BUF_SIZE]; + uint32_t outstanding_requests; + + pthread_spinlock_t queue_lock; + STAILQ_HEAD(, spdk_jsonrpc_request) send_queue; + + struct spdk_jsonrpc_request *send_request; + + TAILQ_ENTRY(spdk_jsonrpc_server_conn) link; +}; + +struct spdk_jsonrpc_server { + int sockfd; + spdk_jsonrpc_handle_request_fn handle_request; + + TAILQ_HEAD(, spdk_jsonrpc_server_conn) free_conns; + TAILQ_HEAD(, spdk_jsonrpc_server_conn) conns; + + struct spdk_jsonrpc_server_conn conns_array[SPDK_JSONRPC_MAX_CONNS]; +}; + +struct spdk_jsonrpc_client_request { + /* Total space allocated for send_buf */ + size_t send_buf_size; + + /* Number of bytes used in send_buf (<= send_buf_size) */ + size_t send_len; + + size_t send_offset; + + uint8_t *send_buf; +}; + +struct spdk_jsonrpc_client { + int sockfd; + + struct spdk_json_val values[SPDK_JSONRPC_MAX_VALUES]; + size_t recv_buf_size; + uint8_t *recv_buf; + + spdk_jsonrpc_client_response_parser parser_fn; + void *parser_ctx; +}; + +/* jsonrpc_server_tcp */ +void spdk_jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params); +void spdk_jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error); + +/* Might 
be called from any thread */ +void spdk_jsonrpc_server_send_response(struct spdk_jsonrpc_request *request); + +/* jsonrpc_server */ +int spdk_jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, void *json, size_t size); + +/* Must be called only from server poll thread */ +void spdk_jsonrpc_free_request(struct spdk_jsonrpc_request *request); + +/* + * Parse JSON data as RPC command response. + * + * \param client structure pointer of jsonrpc client + * \param json Raw JSON data; must be encoded in UTF-8. + * \param size Size of data in bytes. + * + * \return 0 On success + * SPDK_JSON_PARSE_INCOMPLETE If the provided data is not a complete JSON value + * SPDK_JSON_PARSE_INVALID if the provided data has invalid JSON syntax. + */ +int spdk_jsonrpc_parse_response(struct spdk_jsonrpc_client *client, void *json, + size_t size); + +#endif diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server.c b/src/spdk/lib/jsonrpc/jsonrpc_server.c new file mode 100644 index 00000000..6e2a5b2c --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server.c @@ -0,0 +1,360 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "jsonrpc_internal.h" + +#include "spdk/util.h" + +struct jsonrpc_request { + const struct spdk_json_val *version; + const struct spdk_json_val *method; + const struct spdk_json_val *params; + const struct spdk_json_val *id; +}; + +static int +capture_val(const struct spdk_json_val *val, void *out) +{ + const struct spdk_json_val **vptr = out; + + *vptr = val; + return 0; +} + +static const struct spdk_json_object_decoder jsonrpc_request_decoders[] = { + {"jsonrpc", offsetof(struct jsonrpc_request, version), capture_val, true}, + {"method", offsetof(struct jsonrpc_request, method), capture_val}, + {"params", offsetof(struct jsonrpc_request, params), capture_val, true}, + {"id", offsetof(struct jsonrpc_request, id), capture_val, true}, +}; + +static void +parse_single_request(struct spdk_jsonrpc_request *request, struct spdk_json_val *values) +{ + bool invalid = false; + struct jsonrpc_request req = {}; + + if (spdk_json_decode_object(values, jsonrpc_request_decoders, + SPDK_COUNTOF(jsonrpc_request_decoders), + &req)) { + invalid = true; + goto done; + } + + if (req.version && (req.version->type != SPDK_JSON_VAL_STRING || + !spdk_json_strequal(req.version, "2.0"))) { + invalid = true; + } + + if (!req.method || req.method->type != SPDK_JSON_VAL_STRING) { + req.method = NULL; + invalid = true; + } + + if (req.id) { + if (req.id->type == SPDK_JSON_VAL_STRING || + req.id->type == SPDK_JSON_VAL_NUMBER) { + /* Copy value into request */ + if (req.id->len <= SPDK_JSONRPC_ID_MAX_LEN) { + request->id.type = req.id->type; + request->id.len = req.id->len; + memcpy(request->id.start, req.id->start, req.id->len); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON-RPC request id too long (%u)\n", + req.id->len); + invalid = true; + } + } else if (req.id->type == SPDK_JSON_VAL_NULL) { + request->id.type = SPDK_JSON_VAL_NULL; + } else { + invalid = true; + } + } + + if (req.params) { + if (req.params->type != SPDK_JSON_VAL_ARRAY_BEGIN && + req.params->type != SPDK_JSON_VAL_OBJECT_BEGIN) { + req.params = NULL; + invalid = true; + } + } + +done: + if (invalid) { + spdk_jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } else { + spdk_jsonrpc_server_handle_request(request, req.method, req.params); + } +} + +int +spdk_jsonrpc_parse_request(struct spdk_jsonrpc_server_conn *conn, void *json, size_t size) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + void *end = NULL; + + /* Check to see if we have received a full JSON value. 
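 * The first pass below passes a NULL values array, so it only locates the end of the value without storing any tokens.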
*/ + rc = spdk_json_parse(json, size, NULL, 0, &end, 0); + if (rc == SPDK_JSON_PARSE_INCOMPLETE) { + return 0; + } + + request = calloc(1, sizeof(*request)); + if (request == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Out of memory allocating request\n"); + return -1; + } + + conn->outstanding_requests++; + + request->conn = conn; + request->id.start = request->id_data; + request->id.len = 0; + request->id.type = SPDK_JSON_VAL_INVALID; + request->send_offset = 0; + request->send_len = 0; + request->send_buf_size = SPDK_JSONRPC_SEND_BUF_SIZE_INIT; + request->send_buf = malloc(request->send_buf_size); + if (request->send_buf == NULL) { + SPDK_ERRLOG("Failed to allocate send_buf (%zu bytes)\n", request->send_buf_size); + conn->outstanding_requests--; + free(request); + return -1; + } + + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error\n"); + spdk_jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + + /* + * Can't recover from parse error (no guaranteed resync point in streaming JSON). + * Return an error to indicate that the connection should be closed. + */ + return -1; + } + + /* Decode a second time now that there is a full JSON value available. */ + rc = spdk_json_parse(json, size, conn->values, SPDK_JSONRPC_MAX_VALUES, &end, + SPDK_JSON_PARSE_FLAG_DECODE_IN_PLACE); + if (rc < 0 || rc > SPDK_JSONRPC_MAX_VALUES) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "JSON parse error on second pass\n"); + spdk_jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_PARSE_ERROR); + return -1; + } + + assert(end != NULL); + + if (conn->values[0].type == SPDK_JSON_VAL_OBJECT_BEGIN) { + parse_single_request(request, conn->values); + } else if (conn->values[0].type == SPDK_JSON_VAL_ARRAY_BEGIN) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "Got batch array (not currently supported)\n"); + spdk_jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "top-level JSON value was not array or object\n"); + spdk_jsonrpc_server_handle_error(request, SPDK_JSONRPC_ERROR_INVALID_REQUEST); + } + + return end - json; +} + +static int +spdk_jsonrpc_server_write_cb(void *cb_ctx, const void *data, size_t size) +{ + struct spdk_jsonrpc_request *request = cb_ctx; + size_t new_size = request->send_buf_size; + + while (new_size - request->send_len < size) { + if (new_size >= SPDK_JSONRPC_SEND_BUF_SIZE_MAX) { + SPDK_ERRLOG("Send buf exceeded maximum size (%zu)\n", + (size_t)SPDK_JSONRPC_SEND_BUF_SIZE_MAX); + return -1; + } + + new_size *= 2; + } + + if (new_size != request->send_buf_size) { + uint8_t *new_buf; + + new_buf = realloc(request->send_buf, new_size); + if (new_buf == NULL) { + SPDK_ERRLOG("Resizing send_buf failed (current size %zu, new size %zu)\n", + request->send_buf_size, new_size); + return -1; + } + + request->send_buf = new_buf; + request->send_buf_size = new_size; + } + + memcpy(request->send_buf + request->send_len, data, size); + request->send_len += size; + + return 0; +} + +static struct spdk_json_write_ctx * +begin_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w; + + w = spdk_json_write_begin(spdk_jsonrpc_server_write_cb, request, 0); + if (w == NULL) { + return NULL; + } + + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "jsonrpc"); + spdk_json_write_string(w, "2.0"); + + spdk_json_write_name(w, "id"); + spdk_json_write_val(w, &request->id); + + return w; +} + +static void +skip_response(struct spdk_jsonrpc_request *request) +{ + request->send_len = 0; + 
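/* With send_len reset to 0 nothing is written for this request; queueing it anyway lets the poll loop release it through the normal send path. */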
spdk_jsonrpc_server_send_response(request); +} + +static void +end_response(struct spdk_jsonrpc_request *request, struct spdk_json_write_ctx *w) +{ + spdk_json_write_object_end(w); + spdk_json_write_end(w); + spdk_jsonrpc_server_write_cb(request, "\n", 1); + spdk_jsonrpc_server_send_response(request); +} + +void +spdk_jsonrpc_free_request(struct spdk_jsonrpc_request *request) +{ + request->conn->outstanding_requests--; + free(request->send_buf); + free(request); +} + +struct spdk_json_write_ctx * +spdk_jsonrpc_begin_result(struct spdk_jsonrpc_request *request) +{ + struct spdk_json_write_ctx *w; + + if (request->id.type == SPDK_JSON_VAL_INVALID) { + /* Notification - no response required */ + skip_response(request); + return NULL; + } + + w = begin_response(request); + if (w == NULL) { + skip_response(request); + return NULL; + } + + spdk_json_write_name(w, "result"); + + return w; +} + +void +spdk_jsonrpc_end_result(struct spdk_jsonrpc_request *request, struct spdk_json_write_ctx *w) +{ + assert(w != NULL); + + end_response(request, w); +} + +void +spdk_jsonrpc_send_error_response(struct spdk_jsonrpc_request *request, + int error_code, const char *msg) +{ + struct spdk_json_write_ctx *w; + + if (request->id.type == SPDK_JSON_VAL_INVALID) { + /* For error responses, if id is missing, explicitly respond with "id": null. */ + request->id.type = SPDK_JSON_VAL_NULL; + } + + w = begin_response(request); + if (w == NULL) { + skip_response(request); + return; + } + + spdk_json_write_name(w, "error"); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "code"); + spdk_json_write_int32(w, error_code); + spdk_json_write_name(w, "message"); + spdk_json_write_string(w, msg); + spdk_json_write_object_end(w); + + end_response(request, w); +} + +void +spdk_jsonrpc_send_error_response_fmt(struct spdk_jsonrpc_request *request, + int error_code, const char *fmt, ...) +{ + struct spdk_json_write_ctx *w; + va_list args; + + if (request->id.type == SPDK_JSON_VAL_INVALID) { + /* For error responses, if id is missing, explicitly respond with "id": null. */ + request->id.type = SPDK_JSON_VAL_NULL; + } + + w = begin_response(request); + if (w == NULL) { + skip_response(request); + return; + } + + spdk_json_write_name(w, "error"); + spdk_json_write_object_begin(w); + spdk_json_write_name(w, "code"); + spdk_json_write_int32(w, error_code); + spdk_json_write_name(w, "message"); + va_start(args, fmt); + spdk_json_write_string_fmt_v(w, fmt, args); + va_end(args); + spdk_json_write_object_end(w); + + end_response(request, w); +} + +SPDK_LOG_REGISTER_COMPONENT("rpc", SPDK_LOG_RPC) diff --git a/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c new file mode 100644 index 00000000..c69d7483 --- /dev/null +++ b/src/spdk/lib/jsonrpc/jsonrpc_server_tcp.c @@ -0,0 +1,394 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "jsonrpc_internal.h" +#include "spdk/string.h" + +struct spdk_jsonrpc_server * +spdk_jsonrpc_server_listen(int domain, int protocol, + struct sockaddr *listen_addr, socklen_t addrlen, + spdk_jsonrpc_handle_request_fn handle_request) +{ + struct spdk_jsonrpc_server *server; + int rc, val, flag, i; + + server = calloc(1, sizeof(struct spdk_jsonrpc_server)); + if (server == NULL) { + return NULL; + } + + TAILQ_INIT(&server->free_conns); + TAILQ_INIT(&server->conns); + + for (i = 0; i < SPDK_JSONRPC_MAX_CONNS; i++) { + TAILQ_INSERT_TAIL(&server->free_conns, &server->conns_array[i], link); + } + + server->handle_request = handle_request; + + server->sockfd = socket(domain, SOCK_STREAM, protocol); + if (server->sockfd < 0) { + SPDK_ERRLOG("socket() failed\n"); + free(server); + return NULL; + } + + val = 1; + setsockopt(server->sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)); + if (protocol == IPPROTO_TCP) { + setsockopt(server->sockfd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); + } + + flag = fcntl(server->sockfd, F_GETFL); + if (fcntl(server->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + server->sockfd, spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = bind(server->sockfd, listen_addr, addrlen); + if (rc != 0) { + SPDK_ERRLOG("could not bind JSON-RPC server: %s\n", spdk_strerror(errno)); + close(server->sockfd); + free(server); + return NULL; + } + + rc = listen(server->sockfd, 512); + if (rc != 0) { + SPDK_ERRLOG("listen() failed, errno = %d\n", errno); + close(server->sockfd); + free(server); + return NULL; + } + + return server; +} + +void +spdk_jsonrpc_server_shutdown(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + + close(server->sockfd); + + TAILQ_FOREACH(conn, &server->conns, link) { + close(conn->sockfd); + } + + free(server); +} + +static void +spdk_jsonrpc_server_conn_close(struct spdk_jsonrpc_server_conn *conn) +{ + conn->closed = true; + + if (conn->sockfd >= 0) { + close(conn->sockfd); + conn->sockfd = -1; + } +} + +static void +spdk_jsonrpc_server_conn_remove(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_server *server = conn->server; + + spdk_jsonrpc_server_conn_close(conn); + + pthread_spin_destroy(&conn->queue_lock); + assert(STAILQ_EMPTY(&conn->send_queue)); + + TAILQ_REMOVE(&server->conns, conn, link); + TAILQ_INSERT_HEAD(&server->free_conns, conn, link); +} + +static int 
+spdk_jsonrpc_server_accept(struct spdk_jsonrpc_server *server) +{ + struct spdk_jsonrpc_server_conn *conn; + int rc, flag; + + rc = accept(server->sockfd, NULL, NULL); + if (rc >= 0) { + conn = TAILQ_FIRST(&server->free_conns); + assert(conn != NULL); + + conn->server = server; + conn->sockfd = rc; + conn->closed = false; + conn->recv_len = 0; + conn->outstanding_requests = 0; + pthread_spin_init(&conn->queue_lock, PTHREAD_PROCESS_PRIVATE); + STAILQ_INIT(&conn->send_queue); + conn->send_request = NULL; + + flag = fcntl(conn->sockfd, F_GETFL); + if (fcntl(conn->sockfd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + conn->sockfd, spdk_strerror(errno)); + close(conn->sockfd); + return -1; + } + + TAILQ_REMOVE(&server->free_conns, conn, link); + TAILQ_INSERT_TAIL(&server->conns, conn, link); + return 0; + } + + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + return -1; +} + +void +spdk_jsonrpc_server_handle_request(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, const struct spdk_json_val *params) +{ + request->conn->server->handle_request(request, method, params); +} + +void +spdk_jsonrpc_server_handle_error(struct spdk_jsonrpc_request *request, int error) +{ + const char *msg; + + switch (error) { + case SPDK_JSONRPC_ERROR_PARSE_ERROR: + msg = "Parse error"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_REQUEST: + msg = "Invalid request"; + break; + + case SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND: + msg = "Method not found"; + break; + + case SPDK_JSONRPC_ERROR_INVALID_PARAMS: + msg = "Invalid parameters"; + break; + + case SPDK_JSONRPC_ERROR_INTERNAL_ERROR: + msg = "Internal error"; + break; + + default: + msg = "Error"; + break; + } + + spdk_jsonrpc_send_error_response(request, error, msg); +} + +static int +spdk_jsonrpc_server_conn_recv(struct spdk_jsonrpc_server_conn *conn) +{ + ssize_t rc; + size_t recv_avail = SPDK_JSONRPC_RECV_BUF_SIZE - conn->recv_len; + + rc = recv(conn->sockfd, conn->recv_buf + conn->recv_len, recv_avail, 0); + if (rc == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_RPC, "recv() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "remote closed connection\n"); + return -1; + } + + conn->recv_len += rc; + + rc = spdk_jsonrpc_parse_request(conn, conn->recv_buf, conn->recv_len); + if (rc < 0) { + SPDK_ERRLOG("jsonrpc parse request failed\n"); + return -1; + } + + if (rc > 0) { + /* + * Successfully parsed a request - move any data past the end of the + * parsed request down to the beginning. 
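 * Any leftover bytes belong to the next (possibly still incomplete) request and are parsed again after a later recv().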
+ */ + assert((size_t)rc <= conn->recv_len); + memmove(conn->recv_buf, conn->recv_buf + rc, conn->recv_len - rc); + conn->recv_len -= rc; + } + + return 0; +} + +void +spdk_jsonrpc_server_send_response(struct spdk_jsonrpc_request *request) +{ + struct spdk_jsonrpc_server_conn *conn = request->conn; + + /* Queue the response to be sent */ + pthread_spin_lock(&conn->queue_lock); + STAILQ_INSERT_TAIL(&conn->send_queue, request, link); + pthread_spin_unlock(&conn->queue_lock); +} + +static struct spdk_jsonrpc_request * +spdk_jsonrpc_server_dequeue_request(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request = NULL; + + pthread_spin_lock(&conn->queue_lock); + request = STAILQ_FIRST(&conn->send_queue); + if (request) { + STAILQ_REMOVE_HEAD(&conn->send_queue, link); + } + pthread_spin_unlock(&conn->queue_lock); + return request; +} + + +static int +spdk_jsonrpc_server_conn_send(struct spdk_jsonrpc_server_conn *conn) +{ + struct spdk_jsonrpc_request *request; + ssize_t rc; + +more: + if (conn->outstanding_requests == 0) { + return 0; + } + + if (conn->send_request == NULL) { + conn->send_request = spdk_jsonrpc_server_dequeue_request(conn); + } + + request = conn->send_request; + if (request == NULL) { + /* Nothing to send right now */ + return 0; + } + + if (request->send_len > 0) { + rc = send(conn->sockfd, request->send_buf + request->send_offset, + request->send_len, 0); + if (rc < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_RPC, "send() failed: %s\n", spdk_strerror(errno)); + return -1; + } + + request->send_offset += rc; + request->send_len -= rc; + } + + if (request->send_len == 0) { + /* + * Full response has been sent. + * Free it and set send_request to NULL to move on to the next queued response. + */ + conn->send_request = NULL; + spdk_jsonrpc_free_request(request); + goto more; + } + + return 0; +} + +int +spdk_jsonrpc_server_poll(struct spdk_jsonrpc_server *server) +{ + int rc; + struct spdk_jsonrpc_server_conn *conn, *conn_tmp; + + TAILQ_FOREACH_SAFE(conn, &server->conns, link, conn_tmp) { + if (conn->closed) { + struct spdk_jsonrpc_request *request; + + /* + * The client closed the connection, but there may still be requests + * outstanding; we have no way to cancel outstanding requests, so wait until + * each outstanding request sends a response (which will be discarded, since + * the connection is closed). + */ + + if (conn->send_request) { + spdk_jsonrpc_free_request(conn->send_request); + conn->send_request = NULL; + } + + while ((request = spdk_jsonrpc_server_dequeue_request(conn)) != NULL) { + spdk_jsonrpc_free_request(request); + } + + if (conn->outstanding_requests == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RPC, "all outstanding requests completed\n"); + spdk_jsonrpc_server_conn_remove(conn); + } + } + } + + /* Check listen socket */ + if (!TAILQ_EMPTY(&server->free_conns)) { + spdk_jsonrpc_server_accept(server); + } + + TAILQ_FOREACH(conn, &server->conns, link) { + if (conn->closed) { + continue; + } + + rc = spdk_jsonrpc_server_conn_send(conn); + if (rc != 0) { + spdk_jsonrpc_server_conn_close(conn); + continue; + } + + rc = spdk_jsonrpc_server_conn_recv(conn); + if (rc != 0) { + spdk_jsonrpc_server_conn_close(conn); + continue; + } + } + + return 0; +} diff --git a/src/spdk/lib/log/Makefile b/src/spdk/lib/log/Makefile new file mode 100644 index 00000000..8125ebb1 --- /dev/null +++ b/src/spdk/lib/log/Makefile @@ -0,0 +1,45 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. 
+# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = log.c log_flags.c +LIBNAME = log +ifeq ($(CONFIG_LOG_BACKTRACE),y) +LOCAL_SYS_LIBS += -lunwind +endif + +DIRS-y = rpc + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log/log.c b/src/spdk/lib/log/log.c new file mode 100644 index 00000000..9f4546ce --- /dev/null +++ b/src/spdk/lib/log/log.c @@ -0,0 +1,189 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +#ifdef SPDK_LOG_BACKTRACE_LVL +#define UNW_LOCAL_ONLY +#include <libunwind.h> +#endif + +static const char *const spdk_level_names[] = { + [SPDK_LOG_ERROR] = "ERROR", + [SPDK_LOG_WARN] = "WARNING", + [SPDK_LOG_NOTICE] = "NOTICE", + [SPDK_LOG_INFO] = "INFO", + [SPDK_LOG_DEBUG] = "DEBUG", +}; + +#define MAX_TMPBUF 1024 + +void +spdk_log_open(void) +{ + openlog("spdk", LOG_PID, LOG_LOCAL7); +} + +void +spdk_log_close(void) +{ + closelog(); +} + +#ifdef SPDK_LOG_BACKTRACE_LVL +static void +spdk_log_unwind_stack(FILE *fp, enum spdk_log_level level) +{ + unw_error_t err; + unw_cursor_t cursor; + unw_context_t uc; + unw_word_t ip; + unw_word_t offp; + char f_name[64]; + int frame; + + if (level > g_spdk_log_backtrace_level) { + return; + } + + unw_getcontext(&uc); + unw_init_local(&cursor, &uc); + fprintf(fp, "*%s*: === BACKTRACE START ===\n", spdk_level_names[level]); + + unw_step(&cursor); + for (frame = 1; unw_step(&cursor) > 0; frame++) { + unw_get_reg(&cursor, UNW_REG_IP, &ip); + err = unw_get_proc_name(&cursor, f_name, sizeof(f_name), &offp); + if (err || strcmp(f_name, "main") == 0) { + break; + } + + fprintf(fp, "*%s*: %3d: %*s%s() at %#lx\n", spdk_level_names[level], frame, frame - 1, "", f_name, + (unsigned long)ip); + } + fprintf(fp, "*%s*: === BACKTRACE END ===\n", spdk_level_names[level]); +} + +#else +#define spdk_log_unwind_stack(fp, lvl) +#endif + +void +spdk_log(enum spdk_log_level level, const char *file, const int line, const char *func, + const char *format, ...)
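/*
 * Routing mirrors the checks below: a message is emitted to stderr when
 * level <= g_spdk_log_print_level and sent to syslog when level <= g_spdk_log_level,
 * while a level of SPDK_LOG_DISABLED is dropped outright. A direct call looks like
 * the sketch below; the SPDK_ERRLOG()/SPDK_DEBUGLOG() macros used throughout this
 * patch are the usual entry points to it:
 *
 *   spdk_log(SPDK_LOG_NOTICE, __FILE__, __LINE__, __func__, "hello %d\n", 42);
 */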
+{ + int severity = LOG_INFO; + char buf[MAX_TMPBUF]; + va_list ap; + + switch (level) { + case SPDK_LOG_ERROR: + severity = LOG_ERR; + break; + case SPDK_LOG_WARN: + severity = LOG_WARNING; + break; + case SPDK_LOG_NOTICE: + severity = LOG_NOTICE; + break; + case SPDK_LOG_INFO: + case SPDK_LOG_DEBUG: + severity = LOG_INFO; + break; + case SPDK_LOG_DISABLED: + return; + } + + va_start(ap, format); + + vsnprintf(buf, sizeof(buf), format, ap); + + if (level <= g_spdk_log_print_level) { + fprintf(stderr, "%s:%4d:%s: *%s*: %s", file, line, func, spdk_level_names[level], buf); + spdk_log_unwind_stack(stderr, level); + } + + if (level <= g_spdk_log_level) { + syslog(severity, "%s:%4d:%s: *%s*: %s", file, line, func, spdk_level_names[level], buf); + } + + va_end(ap); +} + +static void +fdump(FILE *fp, const char *label, const uint8_t *buf, size_t len) +{ + char tmpbuf[MAX_TMPBUF]; + char buf16[16 + 1]; + size_t total; + unsigned int idx; + + fprintf(fp, "%s\n", label); + + memset(buf16, 0, sizeof buf16); + total = 0; + for (idx = 0; idx < len; idx++) { + if (idx != 0 && idx % 16 == 0) { + snprintf(tmpbuf + total, sizeof tmpbuf - total, + " %s", buf16); + fprintf(fp, "%s\n", tmpbuf); + total = 0; + } + if (idx % 16 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%08x ", idx); + } + if (idx % 8 == 0) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%s", " "); + } + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, + "%2.2x ", buf[idx] & 0xff); + buf16[idx % 16] = isprint(buf[idx]) ? buf[idx] : '.'; + } + for (; idx % 16 != 0; idx++) { + total += snprintf(tmpbuf + total, sizeof tmpbuf - total, " "); + buf16[idx % 16] = ' '; + } + snprintf(tmpbuf + total, sizeof tmpbuf - total, " %s", buf16); + fprintf(fp, "%s\n", tmpbuf); + fflush(fp); +} + +void +spdk_trace_dump(FILE *fp, const char *label, const void *buf, size_t len) +{ + fdump(fp, label, buf, len); +} diff --git a/src/spdk/lib/log/log_flags.c b/src/spdk/lib/log/log_flags.c new file mode 100644 index 00000000..1b766c44 --- /dev/null +++ b/src/spdk/lib/log/log_flags.c @@ -0,0 +1,196 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" + +static TAILQ_HEAD(, spdk_trace_flag) g_trace_flags = TAILQ_HEAD_INITIALIZER(g_trace_flags); + +enum spdk_log_level g_spdk_log_level = SPDK_LOG_NOTICE; +enum spdk_log_level g_spdk_log_print_level = SPDK_LOG_NOTICE; +enum spdk_log_level g_spdk_log_backtrace_level = SPDK_LOG_DISABLED; + +SPDK_LOG_REGISTER_COMPONENT("log", SPDK_LOG_LOG) + +#define MAX_TMPBUF 1024 + +void +spdk_log_set_level(enum spdk_log_level level) +{ + g_spdk_log_level = level; +} + +enum spdk_log_level +spdk_log_get_level(void) { + return g_spdk_log_level; +} + +void +spdk_log_set_print_level(enum spdk_log_level level) +{ + g_spdk_log_print_level = level; +} + +enum spdk_log_level +spdk_log_get_print_level(void) { + return g_spdk_log_print_level; +} + +void +spdk_log_set_backtrace_level(enum spdk_log_level level) +{ + g_spdk_log_backtrace_level = level; +} + +enum spdk_log_level +spdk_log_get_backtrace_level(void) { + return g_spdk_log_backtrace_level; +} + +static struct spdk_trace_flag * +get_trace_flag(const char *name) +{ + struct spdk_trace_flag *flag; + + TAILQ_FOREACH(flag, &g_trace_flags, tailq) { + if (strcasecmp(name, flag->name) == 0) { + return flag; + } + } + + return NULL; +} + +void +spdk_log_register_trace_flag(const char *name, struct spdk_trace_flag *flag) +{ + struct spdk_trace_flag *iter; + + if (name == NULL || flag == NULL) { + SPDK_ERRLOG("missing spdk_trace_flag parameters\n"); + assert(false); + return; + } + + if (get_trace_flag(name)) { + SPDK_ERRLOG("duplicate spdk_trace_flag '%s'\n", name); + assert(false); + return; + } + + TAILQ_FOREACH(iter, &g_trace_flags, tailq) { + if (strcasecmp(iter->name, flag->name) > 0) { + TAILQ_INSERT_BEFORE(iter, flag, tailq); + return; + } + } + + TAILQ_INSERT_TAIL(&g_trace_flags, flag, tailq); +} + +bool +spdk_log_get_trace_flag(const char *name) +{ + struct spdk_trace_flag *flag = get_trace_flag(name); + + if (flag && flag->enabled) { + return true; + } + + return false; +} + +static int +set_trace_flag(const char *name, bool value) +{ + struct spdk_trace_flag *flag; + + if (strcasecmp(name, "all") == 0) { + TAILQ_FOREACH(flag, &g_trace_flags, tailq) { + flag->enabled = value; + } + return 0; + } + + flag = get_trace_flag(name); + if (flag == NULL) { + return -1; + } + + flag->enabled = value; + + return 0; +} + +int +spdk_log_set_trace_flag(const char *name) +{ + return set_trace_flag(name, true); +} + +int +spdk_log_clear_trace_flag(const char *name) +{ + return set_trace_flag(name, false); +} + +struct spdk_trace_flag * +spdk_log_get_first_trace_flag(void) +{ + return TAILQ_FIRST(&g_trace_flags); +} + +struct spdk_trace_flag * +spdk_log_get_next_trace_flag(struct spdk_trace_flag *flag) +{ + return TAILQ_NEXT(flag, tailq); +} + +void +spdk_tracelog_usage(FILE *f, const char *trace_arg) +{ +#ifdef DEBUG + struct spdk_trace_flag *flag; + fprintf(f, " %s, --traceflag enable debug log flag (all", trace_arg); + + TAILQ_FOREACH(flag, &g_trace_flags, tailq) { + 
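/* Append every registered flag name after the leading "all" entry. */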
fprintf(f, ", %s", flag->name); + } + + fprintf(f, ")\n"); +#else + fprintf(f, " %s, --traceflag enable debug log flag (not supported" + " - must rebuild with --enable-debug)\n", trace_arg); +#endif +} diff --git a/src/spdk/lib/log/rpc/Makefile b/src/spdk/lib/log/rpc/Makefile new file mode 100644 index 00000000..bf53a64c --- /dev/null +++ b/src/spdk/lib/log/rpc/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = log_rpc.c +LIBNAME = log_rpc + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/log/rpc/log_rpc.c b/src/spdk/lib/log/rpc/log_rpc.c new file mode 100644 index 00000000..fea7607a --- /dev/null +++ b/src/spdk/lib/log/rpc/log_rpc.c @@ -0,0 +1,336 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_trace_flag { + char *flag; +}; + +struct rpc_log_level { + char *level; +}; + +static void +free_rpc_trace_flag(struct rpc_trace_flag *p) +{ + free(p->flag); +} + +static void +free_rpc_log_level(struct rpc_log_level *p) +{ + free(p->level); +} + +static const struct spdk_json_object_decoder rpc_trace_flag_decoders[] = { + {"flag", offsetof(struct rpc_trace_flag, flag), spdk_json_decode_string}, +}; + +static const struct spdk_json_object_decoder rpc_log_level_decoders[] = { + {"level", offsetof(struct rpc_log_level, level), spdk_json_decode_string}, +}; + +static int +_parse_log_level(char *level) +{ + if (!strcasecmp(level, "ERROR")) { + return SPDK_LOG_ERROR; + } else if (!strcasecmp(level, "WARNING")) { + return SPDK_LOG_WARN; + } else if (!strcasecmp(level, "NOTICE")) { + return SPDK_LOG_NOTICE; + } else if (!strcasecmp(level, "INFO")) { + return SPDK_LOG_INFO; + } else if (!strcasecmp(level, "DEBUG")) { + return SPDK_LOG_DEBUG; + } + return -1; +} + +static const char * +_get_log_level_name(int level) +{ + if (level == SPDK_LOG_ERROR) { + return "ERROR"; + } else if (level == SPDK_LOG_WARN) { + return "WARNING"; + } else if (level == SPDK_LOG_NOTICE) { + return "NOTICE"; + } else if (level == SPDK_LOG_INFO) { + return "INFO"; + } else if (level == SPDK_LOG_DEBUG) { + return "DEBUG"; + } + return NULL; +} + +static void +spdk_rpc_set_log_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "spdk_json_decode_object failed\n"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "try to set invalid log level\n"); + goto invalid; + } + + spdk_log_set_print_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("set_log_print_level", spdk_rpc_set_log_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_log_print_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_trace_flags requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + level = spdk_log_get_print_level(); + name = _get_log_level_name(level); + spdk_json_write_string(w, name); + + 
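	/*
	 * Illustrative example (not prescribed by this file): the reply
	 * carries the level name as a bare JSON string, so with the default
	 * print level (SPDK_LOG_NOTICE) an exchange looks roughly like:
	 *
	 *   -> {"jsonrpc": "2.0", "method": "get_log_print_level", "id": 1}
	 *   <- {"jsonrpc": "2.0", "id": 1, "result": "NOTICE"}
	 */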
+ spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_log_print_level", spdk_rpc_get_log_print_level, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_set_log_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_log_level req = {}; + int level; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_log_level_decoders, + SPDK_COUNTOF(rpc_log_level_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "spdk_json_decode_object failed\n"); + goto invalid; + } + + level = _parse_log_level(req.level); + if (level == -1) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "try to set invalid log level\n"); + goto invalid; + } + + + spdk_log_set_level(level); + free_rpc_log_level(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_log_level(&req); +} +SPDK_RPC_REGISTER("set_log_level", spdk_rpc_set_log_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_log_level(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + int level; + const char *name; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_trace_flags requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + level = spdk_log_get_level(); + name = _get_log_level_name(level); + spdk_json_write_string(w, name); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_log_level", spdk_rpc_get_log_level, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_set_trace_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_trace_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_trace_flag_decoders, + SPDK_COUNTOF(rpc_trace_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "flag was 0\n"); + goto invalid; + } + + spdk_log_set_trace_flag(req.flag); + free_rpc_trace_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_trace_flag(&req); +} +SPDK_RPC_REGISTER("set_trace_flag", spdk_rpc_set_trace_flag, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_clear_trace_flag(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_trace_flag req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_trace_flag_decoders, + SPDK_COUNTOF(rpc_trace_flag_decoders), &req)) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.flag == 0) { + SPDK_DEBUGLOG(SPDK_LOG_LOG, "flag was 0\n"); + goto invalid; + } + + spdk_log_clear_trace_flag(req.flag); + free_rpc_trace_flag(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + 
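	/*
	 * Shared failure path of the log RPCs in this file: malformed or
	 * unknown parameters are reported as a JSON-RPC "Invalid parameters"
	 * error and no state is changed.
	 */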
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_trace_flag(&req); +} +SPDK_RPC_REGISTER("clear_trace_flag", spdk_rpc_clear_trace_flag, + SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_trace_flags(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_trace_flag *flag; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_trace_flags requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_object_begin(w); + flag = spdk_log_get_first_trace_flag(); + while (flag) { + spdk_json_write_name(w, flag->name); + spdk_json_write_bool(w, flag->enabled); + flag = spdk_log_get_next_trace_flag(flag); + } + spdk_json_write_object_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_trace_flags", spdk_rpc_get_trace_flags, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/lvol/Makefile b/src/spdk/lib/lvol/Makefile new file mode 100644 index 00000000..49076fba --- /dev/null +++ b/src/spdk/lib/lvol/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = lvol.c +LIBNAME = lvol + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/lvol/lvol.c b/src/spdk/lib/lvol/lvol.c new file mode 100644 index 00000000..060cc89f --- /dev/null +++ b/src/spdk/lib/lvol/lvol.c @@ -0,0 +1,1494 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk_internal/lvolstore.h" +#include "spdk_internal/log.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/blob_bdev.h" +#include "spdk/util.h" + +/* Default blob channel opts for lvol */ +#define SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS 512 + +#define LVOL_NAME "name" + +SPDK_LOG_REGISTER_COMPONENT("lvol", SPDK_LOG_LVOL) + +static TAILQ_HEAD(, spdk_lvol_store) g_lvol_stores = TAILQ_HEAD_INITIALIZER(g_lvol_stores); +static pthread_mutex_t g_lvol_stores_mutex = PTHREAD_MUTEX_INITIALIZER; + +static inline uint64_t +divide_round_up(uint64_t num, uint64_t divisor) +{ + return (num + divisor - 1) / divisor; +} + +static int +_spdk_add_lvs_to_list(struct spdk_lvol_store *lvs) +{ + struct spdk_lvol_store *tmp; + bool name_conflict = false; + + pthread_mutex_lock(&g_lvol_stores_mutex); + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(lvs->name, tmp->name, SPDK_LVS_NAME_MAX)) { + name_conflict = true; + break; + } + } + if (!name_conflict) { + lvs->on_list = true; + TAILQ_INSERT_TAIL(&g_lvol_stores, lvs, link); + } + pthread_mutex_unlock(&g_lvol_stores_mutex); + + return name_conflict ? 
-1 : 0; +} + +static void +_spdk_lvs_free(struct spdk_lvol_store *lvs) +{ + if (lvs->on_list) { + TAILQ_REMOVE(&g_lvol_stores, lvs, link); + } + free(lvs); +} + +static void +_spdk_lvol_free(struct spdk_lvol *lvol) +{ + free(lvol->unique_id); + free(lvol); +} + +static void +_spdk_lvol_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Failed to open lvol %s\n", lvol->unique_id); + goto end; + } + + lvol->ref_count++; + lvol->blob = blob; +end: + req->cb_fn(req->cb_arg, lvol, lvolerrno); + free(req); +} + +void +spdk_lvol_open(struct spdk_lvol *lvol, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot open lvol - operations on lvol pending\n"); + cb_fn(cb_arg, lvol, -EBUSY); + return; + } + + if (lvol->ref_count > 0) { + lvol->ref_count++; + cb_fn(cb_arg, lvol, 0); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_bs_open_blob(lvol->lvol_store->blobstore, lvol->blob_id, _spdk_lvol_open_cb, req); +} + +static void +_spdk_bs_unload_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + + req->cb_fn(req->cb_arg, NULL, req->lvserrno); + free(req); +} + +static void +_spdk_load_next_lvol(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + struct spdk_lvol *lvol, *tmp; + spdk_blob_id blob_id; + char uuid[SPDK_UUID_STRING_LEN]; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno == -ENOENT) { + /* Finished iterating */ + req->cb_fn(req->cb_arg, lvs, 0); + free(req); + return; + } else if (lvolerrno < 0) { + SPDK_ERRLOG("Failed to fetch blobs list\n"); + req->lvserrno = lvolerrno; + goto invalid; + } + + blob_id = spdk_blob_get_id(blob); + + if (blob_id == lvs->super_blob_id) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "found superblob %"PRIu64"\n", (uint64_t)blob_id); + spdk_bs_iter_next(bs, blob, _spdk_load_next_lvol, req); + return; + } + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + req->lvserrno = -ENOMEM; + goto invalid; + } + + lvol->blob = blob; + lvol->blob_id = blob_id; + lvol->lvol_store = lvs; + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0' || + spdk_uuid_parse(&lvol->uuid, attr) != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Missing or corrupt lvol uuid\n"); + memset(&lvol->uuid, 0, sizeof(lvol->uuid)); + } + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + + if (!spdk_mem_all_zero(&lvol->uuid, sizeof(lvol->uuid))) { + lvol->unique_id = strdup(lvol->uuid_str); + } else { + spdk_uuid_fmt_lower(uuid, sizeof(uuid), &lvol->lvol_store->uuid); + lvol->unique_id = spdk_sprintf_alloc("%s_%"PRIu64, uuid, (uint64_t)blob_id); + } + if (!lvol->unique_id) { 
+ SPDK_ERRLOG("Cannot assign lvol name\n"); + free(lvol); + req->lvserrno = -ENOMEM; + goto invalid; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Cannot assign lvol name\n"); + _spdk_lvol_free(lvol); + req->lvserrno = -EINVAL; + goto invalid; + } + + snprintf(lvol->name, sizeof(lvol->name), "%s", attr); + + TAILQ_INSERT_TAIL(&lvs->lvols, lvol, link); + + lvs->lvol_count++; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "added lvol %s (%s)\n", lvol->unique_id, lvol->uuid_str); + + spdk_bs_iter_next(bs, blob, _spdk_load_next_lvol, req); + + return; + +invalid: + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + free(lvol->unique_id); + free(lvol); + } + + _spdk_lvs_free(lvs); + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); +} + +static void +_spdk_close_super_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not close super blob\n"); + _spdk_lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); + return; + } + + /* Start loading lvols */ + spdk_bs_iter_first(lvs->blobstore, _spdk_load_next_lvol, req); +} + +static void +_spdk_close_super_blob_with_error_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + _spdk_lvs_free(lvs); + + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); +} + +static void +_spdk_lvs_read_uuid(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + const char *attr; + size_t value_len; + int rc; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Could not open super blob\n"); + _spdk_lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "uuid", (const void **)&attr, &value_len); + if (rc != 0 || value_len != SPDK_UUID_STRING_LEN || attr[SPDK_UUID_STRING_LEN - 1] != '\0') { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or incorrect UUID\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, _spdk_close_super_blob_with_error_cb, req); + return; + } + + if (spdk_uuid_parse(&lvs->uuid, attr)) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "incorrect UUID '%s'\n", attr); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, _spdk_close_super_blob_with_error_cb, req); + return; + } + + rc = spdk_blob_get_xattr_value(blob, "name", (const void **)&attr, &value_len); + if (rc != 0 || value_len > SPDK_LVS_NAME_MAX) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "missing or invalid name\n"); + req->lvserrno = -EINVAL; + spdk_blob_close(blob, _spdk_close_super_blob_with_error_cb, req); + return; + } + + snprintf(lvs->name, sizeof(lvs->name), "%s", attr); + + rc = _spdk_add_lvs_to_list(lvs); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvolstore with name %s already exists\n", lvs->name); + req->lvserrno = -EEXIST; + spdk_blob_close(blob, _spdk_close_super_blob_with_error_cb, req); + return; + } + + lvs->super_blob_id = spdk_blob_get_id(blob); + + 
spdk_blob_close(blob, _spdk_close_super_cb, req); +} + +static void +_spdk_lvs_open_super(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs = lvs->blobstore; + + if (lvolerrno != 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Super blob not found\n"); + _spdk_lvs_free(lvs); + req->lvserrno = -ENODEV; + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); + return; + } + + spdk_bs_open_blob(bs, blobid, _spdk_lvs_read_uuid, req); +} + +static void +_spdk_lvs_load_cb(void *cb_arg, struct spdk_blob_store *bs, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = (struct spdk_lvs_with_handle_req *)cb_arg; + struct spdk_lvol_store *lvs; + + if (lvolerrno != 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvs = calloc(1, sizeof(*lvs)); + if (lvs == NULL) { + SPDK_ERRLOG("Cannot alloc memory for lvol store\n"); + spdk_bs_unload(bs, _spdk_bs_unload_with_error_cb, req); + return; + } + + lvs->blobstore = bs; + lvs->bs_dev = req->bs_dev; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + req->lvol_store = lvs; + + spdk_bs_get_super(bs, _spdk_lvs_open_super, req); +} + +static void +spdk_lvs_bs_opts_init(struct spdk_bs_opts *opts) +{ + spdk_bs_opts_init(opts); + opts->max_channel_ops = SPDK_LVOL_BLOB_OPTS_CHANNEL_OPS; +} + +void +spdk_lvs_load(struct spdk_bs_dev *bs_dev, spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_with_handle_req *req; + struct spdk_bs_opts opts = {}; + + assert(cb_fn != NULL); + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + cb_fn(cb_arg, NULL, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + SPDK_ERRLOG("Cannot alloc memory for request structure\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->bs_dev = bs_dev; + + spdk_lvs_bs_opts_init(&opts); + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + spdk_bs_load(bs_dev, &opts, _spdk_lvs_load_cb, req); +} + +static void +_spdk_super_create_close_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Lvol store init failed: could not close super blob\n"); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + _spdk_lvs_free(lvs); + free(req); + return; + } + + req->cb_fn(req->cb_arg, lvs, lvolerrno); + free(req); +} + +static void +_spdk_super_blob_set_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set uuid for super blob\n"); + _spdk_lvs_free(lvs); + free(req); + return; + } + + spdk_blob_close(blob, _spdk_super_create_close_cb, req); +} + +static void +_spdk_super_blob_init_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob *blob = lvs->super_blob; + char uuid[SPDK_UUID_STRING_LEN]; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not set super blob\n"); + _spdk_lvs_free(lvs); + free(req); + return; + } + + spdk_uuid_fmt_lower(uuid, sizeof(uuid), 
&lvs->uuid); + + spdk_blob_set_xattr(blob, "uuid", uuid, sizeof(uuid)); + spdk_blob_set_xattr(blob, "name", lvs->name, strnlen(lvs->name, SPDK_LVS_NAME_MAX) + 1); + spdk_blob_sync_md(blob, _spdk_super_blob_set_cb, req); +} + +static void +_spdk_super_blob_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not open super blob\n"); + _spdk_lvs_free(lvs); + free(req); + return; + } + + lvs->super_blob = blob; + lvs->super_blob_id = spdk_blob_get_id(blob); + + spdk_bs_set_super(lvs->blobstore, lvs->super_blob_id, _spdk_super_blob_init_cb, req); +} + +static void +_spdk_super_blob_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvs_with_handle_req *req = cb_arg; + struct spdk_lvol_store *lvs = req->lvol_store; + struct spdk_blob_store *bs; + + if (lvolerrno < 0) { + req->cb_fn(req->cb_arg, NULL, lvolerrno); + SPDK_ERRLOG("Lvol store init failed: could not create super blob\n"); + _spdk_lvs_free(lvs); + free(req); + return; + } + + bs = req->lvol_store->blobstore; + + spdk_bs_open_blob(bs, blobid, _spdk_super_blob_create_open_cb, req); +} + +static void +_spdk_lvs_init_cb(void *cb_arg, struct spdk_blob_store *bs, int lvserrno) +{ + struct spdk_lvs_with_handle_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvol_store; + + if (lvserrno != 0) { + assert(bs == NULL); + lvs_req->cb_fn(lvs_req->cb_arg, NULL, lvserrno); + SPDK_ERRLOG("Lvol store init failed: could not initialize blobstore\n"); + _spdk_lvs_free(lvs); + free(lvs_req); + return; + } + + assert(bs != NULL); + lvs->blobstore = bs; + TAILQ_INIT(&lvs->lvols); + TAILQ_INIT(&lvs->pending_lvols); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store initialized\n"); + + /* create super blob */ + spdk_bs_create_blob(lvs->blobstore, _spdk_super_blob_create_cb, lvs_req); +} + +void +spdk_lvs_opts_init(struct spdk_lvs_opts *o) +{ + o->cluster_sz = SPDK_LVS_OPTS_CLUSTER_SZ; + memset(o->name, 0, sizeof(o->name)); +} + +static void +_spdk_setup_lvs_opts(struct spdk_bs_opts *bs_opts, struct spdk_lvs_opts *o) +{ + assert(o != NULL); + spdk_lvs_bs_opts_init(bs_opts); + bs_opts->cluster_sz = o->cluster_sz; +} + +int +spdk_lvs_init(struct spdk_bs_dev *bs_dev, struct spdk_lvs_opts *o, + spdk_lvs_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvs_with_handle_req *lvs_req; + struct spdk_bs_opts opts = {}; + int rc; + + if (bs_dev == NULL) { + SPDK_ERRLOG("Blobstore device does not exist\n"); + return -ENODEV; + } + + if (o == NULL) { + SPDK_ERRLOG("spdk_lvs_opts not specified\n"); + return -EINVAL; + } + + _spdk_setup_lvs_opts(&opts, o); + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == SPDK_LVS_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return -EINVAL; + } + + if (strnlen(o->name, SPDK_LVS_NAME_MAX) == 0) { + SPDK_ERRLOG("No name specified.\n"); + return -EINVAL; + } + + lvs = calloc(1, sizeof(*lvs)); + if (!lvs) { + SPDK_ERRLOG("Cannot alloc memory for lvol store base pointer\n"); + return -ENOMEM; + } + + spdk_uuid_generate(&lvs->uuid); + snprintf(lvs->name, sizeof(lvs->name), "%s", o->name); + + rc = _spdk_add_lvs_to_list(lvs); + if (rc) { + SPDK_ERRLOG("lvolstore with name %s already exists\n", lvs->name); + _spdk_lvs_free(lvs); + return -EEXIST; + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + 
_spdk_lvs_free(lvs); + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + assert(cb_fn != NULL); + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvol_store = lvs; + lvs->bs_dev = bs_dev; + lvs->destruct = false; + + snprintf(opts.bstype.bstype, sizeof(opts.bstype.bstype), "LVOLSTORE"); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Initializing lvol store\n"); + spdk_bs_init(bs_dev, &opts, _spdk_lvs_init_cb, lvs_req); + + return 0; +} + +static void +_spdk_lvs_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + + if (lvolerrno != 0) { + req->lvserrno = lvolerrno; + } + if (req->lvserrno != 0) { + SPDK_ERRLOG("Lvol store rename operation failed\n"); + /* Lvs renaming failed, so we should 'clear' new_name. + * Otherwise it could cause a failure on the next attepmt to change the name to 'new_name' */ + snprintf(req->lvol_store->new_name, + sizeof(req->lvol_store->new_name), + "%s", req->lvol_store->name); + } else { + /* Update lvs name with new_name */ + snprintf(req->lvol_store->name, + sizeof(req->lvol_store->name), + "%s", req->lvol_store->new_name); + } + + req->cb_fn(req->cb_arg, req->lvserrno); + free(req); +} + +static void +_spdk_lvs_rename_sync_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + struct spdk_blob *blob = req->lvol_store->super_blob; + + if (lvolerrno < 0) { + req->lvserrno = lvolerrno; + } + + spdk_blob_close(blob, _spdk_lvs_rename_cb, req); +} + +static void +_spdk_lvs_rename_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvs_req *req = cb_arg; + int rc; + + if (lvolerrno < 0) { + _spdk_lvs_rename_cb(cb_arg, lvolerrno); + return; + } + + rc = spdk_blob_set_xattr(blob, "name", req->lvol_store->new_name, + strlen(req->lvol_store->new_name) + 1); + if (rc < 0) { + req->lvserrno = rc; + _spdk_lvs_rename_sync_cb(req, rc); + return; + } + + req->lvol_store->super_blob = blob; + + spdk_blob_sync_md(blob, _spdk_lvs_rename_sync_cb, req); +} + +void +spdk_lvs_rename(struct spdk_lvol_store *lvs, const char *new_name, + spdk_lvs_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvs_req *req; + struct spdk_lvol_store *tmp; + + /* Check if new name is current lvs name. 
+ * If so, return success immediately */ + if (strncmp(lvs->name, new_name, SPDK_LVS_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if new or new_name is already used in other lvs */ + TAILQ_FOREACH(tmp, &g_lvol_stores, link) { + if (!strncmp(new_name, tmp->name, SPDK_LVS_NAME_MAX) || + !strncmp(new_name, tmp->new_name, SPDK_LVS_NAME_MAX)) { + cb_fn(cb_arg, -EEXIST); + return; + } + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + snprintf(lvs->new_name, sizeof(lvs->new_name), "%s", new_name); + req->lvol_store = lvs; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_open_blob(lvs->blobstore, lvs->super_blob_id, _spdk_lvs_rename_open_cb, req); +} + +static void +_lvs_unload_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store unloaded\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +int +spdk_lvs_unload(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_req *lvs_req; + struct spdk_lvol *lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + if (lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot unload lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(lvol, &lvs->lvols, link, tmp) { + TAILQ_REMOVE(&lvs->lvols, lvol, link); + _spdk_lvol_free(lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Unloading lvol store\n"); + spdk_bs_unload(lvs->blobstore, _lvs_unload_cb, lvs_req); + _spdk_lvs_free(lvs); + + return 0; +} + +static void +_lvs_destroy_cb(void *cb_arg, int lvserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol store destroyed\n"); + assert(lvs_req->cb_fn != NULL); + lvs_req->cb_fn(lvs_req->cb_arg, lvserrno); + free(lvs_req); +} + +static void +_lvs_destroy_super_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvs_destroy_req *lvs_req = cb_arg; + struct spdk_lvol_store *lvs = lvs_req->lvs; + + assert(lvs != NULL); + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Destroying lvol store\n"); + spdk_bs_destroy(lvs->blobstore, _lvs_destroy_cb, lvs_req); + _spdk_lvs_free(lvs); +} + +int +spdk_lvs_destroy(struct spdk_lvol_store *lvs, spdk_lvs_op_complete cb_fn, + void *cb_arg) +{ + struct spdk_lvs_destroy_req *lvs_req; + struct spdk_lvol *iter_lvol, *tmp; + + if (lvs == NULL) { + SPDK_ERRLOG("Lvol store is NULL\n"); + return -ENODEV; + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + if (iter_lvol->action_in_progress == true) { + SPDK_ERRLOG("Cannot destroy lvol store - operations on lvols pending\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } else if (iter_lvol->ref_count != 0) { + SPDK_ERRLOG("Lvols still open on lvol store\n"); + cb_fn(cb_arg, -EBUSY); + return -EBUSY; + } + } + + TAILQ_FOREACH_SAFE(iter_lvol, &lvs->lvols, link, tmp) { + free(iter_lvol->unique_id); + free(iter_lvol); + } + + lvs_req = calloc(1, sizeof(*lvs_req)); + if (!lvs_req) { + 
SPDK_ERRLOG("Cannot alloc memory for lvol store request pointer\n"); + return -ENOMEM; + } + + lvs_req->cb_fn = cb_fn; + lvs_req->cb_arg = cb_arg; + lvs_req->lvs = lvs; + + SPDK_INFOLOG(SPDK_LOG_LVOL, "Deleting super blob\n"); + spdk_bs_delete_blob(lvs->blobstore, lvs->super_blob_id, _lvs_destroy_super_cb, lvs_req); + + return 0; +} + +static void +_spdk_lvol_close_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not close blob on lvol\n"); + _spdk_lvol_free(lvol); + goto end; + } + + lvol->ref_count--; + lvol->action_in_progress = false; + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s closed\n", lvol->unique_id); + +end: + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +bool +spdk_lvol_deletable(struct spdk_lvol *lvol) +{ + size_t count; + + spdk_blob_get_clones(lvol->lvol_store->blobstore, lvol->blob_id, NULL, &count); + return (count == 0); +} + +static void +_spdk_lvol_delete_blob_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not delete blob on lvol\n"); + goto end; + } + + TAILQ_REMOVE(&lvol->lvol_store->lvols, lvol, link); + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol %s deleted\n", lvol->unique_id); + +end: + _spdk_lvol_free(lvol); + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +_spdk_lvol_destroy_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + struct spdk_blob_store *bs = lvol->lvol_store->blobstore; + + if (lvolerrno < 0) { + SPDK_ERRLOG("Could not close blob on lvol\n"); + _spdk_lvol_free(lvol); + req->cb_fn(req->cb_arg, lvolerrno); + free(req); + return; + } + SPDK_INFOLOG(SPDK_LOG_LVOL, "Blob closed on lvol %s\n", lvol->unique_id); + + spdk_bs_delete_blob(bs, lvol->blob_id, _spdk_lvol_delete_blob_cb, req); +} + +static void +_spdk_lvol_create_open_cb(void *cb_arg, struct spdk_blob *blob, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + spdk_blob_id blob_id = spdk_blob_get_id(blob); + struct spdk_lvol *lvol = req->lvol; + + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + + if (lvolerrno < 0) { + free(lvol); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + lvol->blob = blob; + lvol->blob_id = blob_id; + + TAILQ_INSERT_TAIL(&lvol->lvol_store->lvols, lvol, link); + + lvol->unique_id = strdup(lvol->uuid_str); + if (!lvol->unique_id) { + SPDK_ERRLOG("Cannot alloc memory for lvol name\n"); + spdk_blob_close(blob, _spdk_lvol_destroy_cb, req); + return; + } + + lvol->ref_count++; + + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, req->lvol, lvolerrno); + free(req); +} + +static void +_spdk_lvol_create_cb(void *cb_arg, spdk_blob_id blobid, int lvolerrno) +{ + struct spdk_lvol_with_handle_req *req = cb_arg; + struct spdk_blob_store *bs; + + if (lvolerrno < 0) { + TAILQ_REMOVE(&req->lvol->lvol_store->pending_lvols, req->lvol, link); + free(req->lvol); + assert(req->cb_fn != NULL); + req->cb_fn(req->cb_arg, NULL, lvolerrno); + free(req); + return; + } + + bs = req->lvol->lvol_store->blobstore; + + spdk_bs_open_blob(bs, blobid, _spdk_lvol_create_open_cb, req); +} + +static void +spdk_lvol_get_xattr_value(void *xattr_ctx, const char *name, + const void **value, size_t *value_len) +{ + struct spdk_lvol *lvol = xattr_ctx; + + if (!strcmp(LVOL_NAME, name)) { + *value = lvol->name; + *value_len = SPDK_LVOL_NAME_MAX; + } else 
if (!strcmp("uuid", name)) { + *value = lvol->uuid_str; + *value_len = sizeof(lvol->uuid_str); + } +} + +static int +_spdk_lvs_verify_lvol_name(struct spdk_lvol_store *lvs, const char *name) +{ + struct spdk_lvol *tmp; + + if (name == NULL || strnlen(name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "lvol name not provided.\n"); + return -EINVAL; + } + + if (strnlen(name, SPDK_LVOL_NAME_MAX) == SPDK_LVOL_NAME_MAX) { + SPDK_ERRLOG("Name has no null terminator.\n"); + return -EINVAL; + } + + TAILQ_FOREACH(tmp, &lvs->lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s already exists\n", name); + return -EEXIST; + } + } + + TAILQ_FOREACH(tmp, &lvs->pending_lvols, link) { + if (!strncmp(name, tmp->name, SPDK_LVOL_NAME_MAX)) { + SPDK_ERRLOG("lvol with name %s is being already created\n", name); + return -EEXIST; + } + } + + return 0; +} + +int +spdk_lvol_create(struct spdk_lvol_store *lvs, const char *name, uint64_t sz, + bool thin_provision, spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_store *bs; + struct spdk_lvol *lvol; + struct spdk_blob_opts opts; + uint64_t num_clusters; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + return -EINVAL; + } + + rc = _spdk_lvs_verify_lvol_name(lvs, name); + if (rc < 0) { + return rc; + } + + bs = lvs->blobstore; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + return -ENOMEM; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + lvol = calloc(1, sizeof(*lvol)); + if (!lvol) { + free(req); + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + return -ENOMEM; + } + lvol->lvol_store = lvs; + num_clusters = divide_round_up(sz, spdk_bs_get_cluster_size(bs)); + lvol->thin_provision = thin_provision; + snprintf(lvol->name, sizeof(lvol->name), "%s", name); + TAILQ_INSERT_TAIL(&lvol->lvol_store->pending_lvols, lvol, link); + spdk_uuid_generate(&lvol->uuid); + spdk_uuid_fmt_lower(lvol->uuid_str, sizeof(lvol->uuid_str), &lvol->uuid); + req->lvol = lvol; + + spdk_blob_opts_init(&opts); + opts.thin_provision = thin_provision; + opts.num_clusters = num_clusters; + opts.xattrs.count = SPDK_COUNTOF(xattr_names); + opts.xattrs.names = xattr_names; + opts.xattrs.ctx = lvol; + opts.xattrs.get_value = spdk_lvol_get_xattr_value; + + spdk_bs_create_blob_ext(lvs->blobstore, &opts, _spdk_lvol_create_cb, req); + + return 0; +} + +void +spdk_lvol_create_snapshot(struct spdk_lvol *origlvol, const char *snapshot_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_store *lvs; + struct spdk_lvol *newlvol; + struct spdk_blob *origblob; + struct spdk_lvol_with_handle_req *req; + struct spdk_blob_xattr_opts snapshot_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = _spdk_lvs_verify_lvol_name(lvs, snapshot_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, 
sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = origlvol->lvol_store; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", snapshot_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + snapshot_xattrs.count = SPDK_COUNTOF(xattr_names); + snapshot_xattrs.ctx = newlvol; + snapshot_xattrs.names = xattr_names; + snapshot_xattrs.get_value = spdk_lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_snapshot(lvs->blobstore, spdk_blob_get_id(origblob), &snapshot_xattrs, + _spdk_lvol_create_cb, req); +} + +void +spdk_lvol_create_clone(struct spdk_lvol *origlvol, const char *clone_name, + spdk_lvol_op_with_handle_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *newlvol; + struct spdk_lvol_with_handle_req *req; + struct spdk_lvol_store *lvs; + struct spdk_blob *origblob; + struct spdk_blob_xattr_opts clone_xattrs; + char *xattr_names[] = {LVOL_NAME, "uuid"}; + int rc; + + if (origlvol == NULL) { + SPDK_INFOLOG(SPDK_LOG_LVOL, "Lvol not provided.\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + origblob = origlvol->blob; + lvs = origlvol->lvol_store; + if (lvs == NULL) { + SPDK_ERRLOG("lvol store does not exist\n"); + cb_fn(cb_arg, NULL, -EINVAL); + return; + } + + rc = _spdk_lvs_verify_lvol_name(lvs, clone_name); + if (rc < 0) { + cb_fn(cb_arg, NULL, rc); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol = calloc(1, sizeof(*newlvol)); + if (!newlvol) { + SPDK_ERRLOG("Cannot alloc memory for lvol base pointer\n"); + free(req); + cb_fn(cb_arg, NULL, -ENOMEM); + return; + } + + newlvol->lvol_store = lvs; + snprintf(newlvol->name, sizeof(newlvol->name), "%s", clone_name); + TAILQ_INSERT_TAIL(&newlvol->lvol_store->pending_lvols, newlvol, link); + spdk_uuid_generate(&newlvol->uuid); + spdk_uuid_fmt_lower(newlvol->uuid_str, sizeof(newlvol->uuid_str), &newlvol->uuid); + clone_xattrs.count = SPDK_COUNTOF(xattr_names); + clone_xattrs.ctx = newlvol; + clone_xattrs.names = xattr_names; + clone_xattrs.get_value = spdk_lvol_get_xattr_value; + req->lvol = newlvol; + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + + spdk_bs_create_clone(lvs->blobstore, spdk_blob_get_id(origblob), &clone_xattrs, + _spdk_lvol_create_cb, + req); +} + +static void +_spdk_lvol_resize_done(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +static void +_spdk_lvol_blob_resize_cb(void *cb_arg, int bserrno) +{ + struct spdk_lvol_req *req = cb_arg; + struct spdk_lvol *lvol = req->lvol; + + if (bserrno != 0) { + req->cb_fn(req->cb_arg, bserrno); + free(req); + return; + } + + spdk_blob_sync_md(lvol->blob, _spdk_lvol_resize_done, req); +} + +void +spdk_lvol_resize(struct spdk_lvol *lvol, uint64_t sz, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_store *lvs = lvol->lvol_store; + struct spdk_lvol_req *req; + uint64_t new_clusters = divide_round_up(sz, spdk_bs_get_cluster_size(lvs->blobstore)); + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); 
+ return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_resize(blob, new_clusters, _spdk_lvol_blob_resize_cb, req); +} + +static void +_spdk_lvol_rename_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + if (lvolerrno != 0) { + SPDK_ERRLOG("Lvol rename operation failed\n"); + } else { + snprintf(req->lvol->name, sizeof(req->lvol->name), "%s", req->name); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_rename(struct spdk_lvol *lvol, const char *new_name, + spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol *tmp; + struct spdk_blob *blob = lvol->blob; + struct spdk_lvol_req *req; + int rc; + + /* Check if new name is current lvol name. + * If so, return success immediately */ + if (strncmp(lvol->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + cb_fn(cb_arg, 0); + return; + } + + /* Check if lvol with 'new_name' already exists in lvolstore */ + TAILQ_FOREACH(tmp, &lvol->lvol_store->lvols, link) { + if (strncmp(tmp->name, new_name, SPDK_LVOL_NAME_MAX) == 0) { + SPDK_ERRLOG("Lvol %s already exists in lvol store %s\n", new_name, lvol->lvol_store->name); + cb_fn(cb_arg, -EEXIST); + return; + } + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + snprintf(req->name, sizeof(req->name), "%s", new_name); + + rc = spdk_blob_set_xattr(blob, "name", new_name, strlen(new_name) + 1); + if (rc < 0) { + free(req); + cb_fn(cb_arg, rc); + return; + } + + spdk_blob_sync_md(blob, _spdk_lvol_rename_cb, req); +} + +void +spdk_lvol_destroy(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + struct spdk_blob_store *bs = lvol->lvol_store->blobstore; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count != 0) { + SPDK_ERRLOG("Cannot destroy lvol %s because it is still open\n", lvol->unique_id); + cb_fn(cb_arg, -EBUSY); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_bs_delete_blob(bs, lvol->blob_id, _spdk_lvol_delete_blob_cb, req); +} + +void +spdk_lvol_close(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + if (lvol->ref_count > 1) { + lvol->ref_count--; + cb_fn(cb_arg, 0); + return; + } else if (lvol->ref_count == 0) { + cb_fn(cb_arg, -EINVAL); + return; + } + + lvol->action_in_progress = true; + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->lvol = lvol; + + spdk_blob_close(lvol->blob, _spdk_lvol_close_blob_cb, req); +} + +struct spdk_io_channel * +spdk_lvol_get_io_channel(struct spdk_lvol *lvol) +{ + return spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); +} + +static void +_spdk_lvol_inflate_cb(void *cb_arg, int lvolerrno) +{ + struct spdk_lvol_req *req = cb_arg; + + spdk_bs_free_io_channel(req->channel); + + if 
(lvolerrno < 0) { + SPDK_ERRLOG("Could not inflate lvol\n"); + } + + req->cb_fn(req->cb_arg, lvolerrno); + free(req); +} + +void +spdk_lvol_inflate(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + struct spdk_blob *blob = lvol->blob; + spdk_blob_id blob_id = spdk_blob_get_id(blob); + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_inflate_blob(lvol->lvol_store->blobstore, req->channel, blob_id, _spdk_lvol_inflate_cb, + req); +} + +void +spdk_lvol_decouple_parent(struct spdk_lvol *lvol, spdk_lvol_op_complete cb_fn, void *cb_arg) +{ + struct spdk_lvol_req *req; + struct spdk_blob *blob = lvol->blob; + spdk_blob_id blob_id = spdk_blob_get_id(blob); + + assert(cb_fn != NULL); + + if (lvol == NULL) { + SPDK_ERRLOG("Lvol does not exist\n"); + cb_fn(cb_arg, -ENODEV); + return; + } + + req = calloc(1, sizeof(*req)); + if (!req) { + SPDK_ERRLOG("Cannot alloc memory for lvol request pointer\n"); + cb_fn(cb_arg, -ENOMEM); + return; + } + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->channel = spdk_bs_alloc_io_channel(lvol->lvol_store->blobstore); + if (req->channel == NULL) { + SPDK_ERRLOG("Cannot alloc io channel for lvol inflate request\n"); + free(req); + cb_fn(cb_arg, -ENOMEM); + return; + } + + spdk_bs_blob_decouple_parent(lvol->lvol_store->blobstore, req->channel, blob_id, + _spdk_lvol_inflate_cb, req); +} diff --git a/src/spdk/lib/nbd/Makefile b/src/spdk/lib/nbd/Makefile new file mode 100644 index 00000000..419a2158 --- /dev/null +++ b/src/spdk/lib/nbd/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = nbd +C_SRCS = nbd.c nbd_rpc.c + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nbd/nbd.c b/src/spdk/lib/nbd/nbd.c new file mode 100644 index 00000000..639f122c --- /dev/null +++ b/src/spdk/lib/nbd/nbd.c @@ -0,0 +1,969 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include + +#include "spdk/nbd.h" +#include "nbd_internal.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/util.h" +#include "spdk/thread.h" +#include "spdk/event.h" + +#include "spdk_internal/log.h" +#include "spdk/queue.h" + +#define GET_IO_LOOP_COUNT 16 + +enum nbd_io_state_t { + /* Receiving or ready to receive nbd request header */ + NBD_IO_RECV_REQ = 0, + /* Receiving write payload */ + NBD_IO_RECV_PAYLOAD, + /* Transmitting or ready to transmit nbd response header */ + NBD_IO_XMIT_RESP, + /* Transmitting read payload */ + NBD_IO_XMIT_PAYLOAD, +}; + +struct nbd_io { + struct spdk_nbd_disk *nbd; + enum nbd_io_state_t state; + + void *payload; + uint32_t payload_size; + + struct nbd_request req; + struct nbd_reply resp; + + /* + * Tracks current progress on reading/writing a request, + * response, or payload from the nbd socket. 
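	 * The offset is reset to zero each time a complete header or payload
	 * has been transferred and the io advances to the next
	 * nbd_io_state_t stage.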
+ */ + uint32_t offset; + + /* for bdev io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + TAILQ_ENTRY(nbd_io) tailq; +}; + +enum nbd_disk_state_t { + NBD_DISK_STATE_RUNNING = 0, + /* soft disconnection caused by receiving nbd_cmd_disc */ + NBD_DISK_STATE_SOFTDISC, + /* hard disconnection caused by mandatory conditions */ + NBD_DISK_STATE_HARDDISC, +}; + +struct spdk_nbd_disk { + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *ch; + int dev_fd; + char *nbd_path; + int kernel_sp_fd; + int spdk_sp_fd; + struct spdk_poller *nbd_poller; + uint32_t buf_align; + + struct nbd_io *io_in_recv; + TAILQ_HEAD(, nbd_io) received_io_list; + TAILQ_HEAD(, nbd_io) executed_io_list; + + enum nbd_disk_state_t state; + /* count of nbd_io in spdk_nbd_disk */ + int io_count; + + TAILQ_ENTRY(spdk_nbd_disk) tailq; +}; + +struct spdk_nbd_disk_globals { + TAILQ_HEAD(, spdk_nbd_disk) disk_head; +}; + +static struct spdk_nbd_disk_globals g_spdk_nbd; + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io); + +int +spdk_nbd_init(void) +{ + TAILQ_INIT(&g_spdk_nbd.disk_head); + + return 0; +} + +void +spdk_nbd_fini(void) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * Stop running spdk_nbd_disk. + * Here, nbd removing are unnecessary, but _SAFE variant + * is needed, since internal spdk_nbd_disk_unregister will + * remove nbd from TAILQ. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + spdk_nbd_stop(nbd_idx); + } +} + +static int +spdk_nbd_disk_register(struct spdk_nbd_disk *nbd) +{ + if (spdk_nbd_disk_find_by_nbd_path(nbd->nbd_path)) { + SPDK_NOTICELOG("%s is already exported\n", nbd->nbd_path); + return -1; + } + + TAILQ_INSERT_TAIL(&g_spdk_nbd.disk_head, nbd, tailq); + + return 0; +} + +static void +spdk_nbd_disk_unregister(struct spdk_nbd_disk *nbd) +{ + struct spdk_nbd_disk *nbd_idx, *nbd_tmp; + + /* + * nbd disk may be stopped before registered. + * check whether it was registered. + */ + TAILQ_FOREACH_SAFE(nbd_idx, &g_spdk_nbd.disk_head, tailq, nbd_tmp) { + if (nbd == nbd_idx) { + TAILQ_REMOVE(&g_spdk_nbd.disk_head, nbd_idx, tailq); + break; + } + } +} + +struct spdk_nbd_disk * +spdk_nbd_disk_find_by_nbd_path(const char *nbd_path) +{ + struct spdk_nbd_disk *nbd; + + /* + * check whether nbd has already been registered by nbd path. 
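	 * The lookup key is the kernel nbd device node associated with the
	 * disk (e.g. "/dev/nbd0"), the same string that
	 * spdk_nbd_write_config_json() emits as the "nbd_device" parameter.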
+ */ + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + if (!strcmp(nbd->nbd_path, nbd_path)) { + return nbd; + } + } + + return NULL; +} + +struct spdk_nbd_disk *spdk_nbd_disk_first(void) +{ + return TAILQ_FIRST(&g_spdk_nbd.disk_head); +} + +struct spdk_nbd_disk *spdk_nbd_disk_next(struct spdk_nbd_disk *prev) +{ + return TAILQ_NEXT(prev, tailq); +} + +const char * +spdk_nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd) +{ + return nbd->nbd_path; +} + +const char * +spdk_nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd) +{ + return spdk_bdev_get_name(nbd->bdev); +} + +void +spdk_nbd_write_config_json(struct spdk_json_write_ctx *w) +{ + struct spdk_nbd_disk *nbd; + + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(nbd, &g_spdk_nbd.disk_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "method", "start_nbd_disk"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nbd_device", spdk_nbd_disk_get_nbd_path(nbd)); + spdk_json_write_named_string(w, "bdev_name", spdk_nbd_disk_get_bdev_name(nbd)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +void +nbd_disconnect(struct spdk_nbd_disk *nbd) +{ + /* + * nbd soft-disconnection to terminate transmission phase. + * After receiving this ioctl command, nbd kernel module will send + * a NBD_CMD_DISC type io to nbd server in order to inform server. + */ + ioctl(nbd->dev_fd, NBD_DISCONNECT); +} + +static struct nbd_io * +spdk_get_nbd_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + + io = calloc(1, sizeof(*io)); + if (!io) { + return NULL; + } + + io->nbd = nbd; + to_be32(&io->resp.magic, NBD_REPLY_MAGIC); + + nbd->io_count++; + + return io; +} + +static void +spdk_put_nbd_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + if (io->payload) { + spdk_dma_free(io->payload); + } + free(io); + + nbd->io_count--; +} + +/* + * Check whether received nbd_io are all transmitted. + * + * \return 1 there is still some nbd_io not transmitted. + * 0 all nbd_io received are transmitted. + */ +static int +spdk_nbd_io_xmit_check(struct spdk_nbd_disk *nbd) +{ + if (nbd->io_count == 0) { + return 0; + } else if (nbd->io_count == 1 && nbd->io_in_recv != NULL) { + return 0; + } + + return 1; +} + +/* + * Check whether received nbd_io are all executed, + * and put back executed nbd_io instead of transmitting them + * + * \return 1 there is still some nbd_io under executing + * 0 all nbd_io gotten are freed. + */ +static int +spdk_nbd_cleanup_io(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + + /* free io_in_recv */ + if (nbd->io_in_recv != NULL) { + spdk_put_nbd_io(nbd, nbd->io_in_recv); + nbd->io_in_recv = NULL; + } + + /* free io in received_io_list */ + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); + } + } + + /* free io in executed_io_list */ + if (!TAILQ_EMPTY(&nbd->executed_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->executed_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + spdk_put_nbd_io(nbd, io); + } + } + + /* + * Some nbd_io may be under executing in bdev. + * Wait for their done operation. 
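	 * When those outstanding I/Os complete, nbd_io_done() re-evaluates
	 * this condition and, for a hard disconnect, finishes the teardown
	 * via _nbd_stop().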
+ */ + if (nbd->io_count != 0) { + return 1; + } + + return 0; +} + +static void +_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd->ch) { + spdk_put_io_channel(nbd->ch); + } + + if (nbd->bdev_desc) { + spdk_bdev_close(nbd->bdev_desc); + } + + if (nbd->nbd_path) { + free(nbd->nbd_path); + } + + if (nbd->spdk_sp_fd >= 0) { + close(nbd->spdk_sp_fd); + } + + if (nbd->kernel_sp_fd >= 0) { + close(nbd->kernel_sp_fd); + } + + if (nbd->dev_fd >= 0) { + ioctl(nbd->dev_fd, NBD_CLEAR_QUE); + ioctl(nbd->dev_fd, NBD_CLEAR_SOCK); + close(nbd->dev_fd); + } + + if (nbd->nbd_poller) { + spdk_poller_unregister(&nbd->nbd_poller); + } + + spdk_nbd_disk_unregister(nbd); + + free(nbd); +} + +void +spdk_nbd_stop(struct spdk_nbd_disk *nbd) +{ + if (nbd == NULL) { + return; + } + + nbd->state = NBD_DISK_STATE_HARDDISC; + + /* + * Stop action should be called only after all nbd_io are executed. + */ + if (!spdk_nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static int64_t +read_from_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_read; + + bytes_read = read(fd, buf, length); + if (bytes_read == 0) { + return -EIO; + } else if (bytes_read == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_read; + } +} + +static int64_t +write_to_socket(int fd, void *buf, size_t length) +{ + ssize_t bytes_written; + + bytes_written = write(fd, buf, length); + if (bytes_written == 0) { + return -EIO; + } else if (bytes_written == -1) { + if (errno != EAGAIN) { + return -errno; + } + return 0; + } else { + return bytes_written; + } +} + +static void +nbd_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct nbd_io *io = cb_arg; + struct spdk_nbd_disk *nbd = io->nbd; + + if (success) { + io->resp.error = 0; + } else { + to_be32(&io->resp.error, EIO); + } + + memcpy(&io->resp.handle, &io->req.handle, sizeof(io->resp.handle)); + TAILQ_INSERT_TAIL(&nbd->executed_io_list, io, tailq); + + if (bdev_io != NULL) { + spdk_bdev_free_io(bdev_io); + } + + if (nbd->state == NBD_DISK_STATE_HARDDISC && !spdk_nbd_cleanup_io(nbd)) { + _nbd_stop(nbd); + } +} + +static void +nbd_resubmit_io(void *arg) +{ + struct nbd_io *io = (struct nbd_io *)arg; + struct spdk_nbd_disk *nbd = io->nbd; + int rc = 0; + + rc = nbd_submit_bdev_io(nbd, io); + if (rc) { + SPDK_INFOLOG(SPDK_LOG_NBD, "nbd: io resubmit for dev %s , io_type %d, returned %d.\n", + spdk_nbd_disk_get_bdev_name(nbd), from_be32(&io->req.type), rc); + } +} + +static void +nbd_queue_io(struct nbd_io *io) +{ + int rc; + struct spdk_bdev *bdev = io->nbd->bdev; + + io->bdev_io_wait.bdev = bdev; + io->bdev_io_wait.cb_fn = nbd_resubmit_io; + io->bdev_io_wait.cb_arg = io; + + rc = spdk_bdev_queue_io_wait(bdev, io->nbd->ch, &io->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } +} + +static int +nbd_submit_bdev_io(struct spdk_nbd_disk *nbd, struct nbd_io *io) +{ + struct spdk_bdev_desc *desc = nbd->bdev_desc; + struct spdk_io_channel *ch = nbd->ch; + int rc = 0; + + switch (from_be32(&io->req.type)) { + case NBD_CMD_READ: + rc = spdk_bdev_read(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; + case NBD_CMD_WRITE: + rc = spdk_bdev_write(desc, ch, io->payload, from_be64(&io->req.from), + io->payload_size, nbd_io_done, io); + break; +#ifdef NBD_FLAG_SEND_FLUSH + case NBD_CMD_FLUSH: + rc = spdk_bdev_flush(desc, ch, 0, + spdk_bdev_get_num_blocks(nbd->bdev) * spdk_bdev_get_block_size(nbd->bdev), + 
nbd_io_done, io); + break; +#endif +#ifdef NBD_FLAG_SEND_TRIM + case NBD_CMD_TRIM: + rc = spdk_bdev_unmap(desc, ch, from_be64(&io->req.from), + from_be32(&io->req.len), nbd_io_done, io); + break; +#endif + case NBD_CMD_DISC: + spdk_put_nbd_io(nbd, io); + nbd->state = NBD_DISK_STATE_SOFTDISC; + break; + default: + rc = -1; + } + + if (rc < 0) { + if (rc == -ENOMEM) { + SPDK_INFOLOG(SPDK_LOG_NBD, "No memory, start to queue io.\n"); + nbd_queue_io(io); + } else { + SPDK_ERRLOG("nbd io failed in nbd_queue_io, rc=%d.\n", rc); + nbd_io_done(NULL, false, io); + } + } + + return 0; +} + +static int +spdk_nbd_io_exec(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io, *io_tmp; + int ret = 0; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + if (!TAILQ_EMPTY(&nbd->received_io_list)) { + TAILQ_FOREACH_SAFE(io, &nbd->received_io_list, tailq, io_tmp) { + TAILQ_REMOVE(&nbd->received_io_list, io, tailq); + ret = nbd_submit_bdev_io(nbd, io); + if (ret < 0) { + break; + } + } + } + + return ret; +} + +static int +spdk_nbd_io_recv_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + + if (nbd->io_in_recv == NULL) { + nbd->io_in_recv = spdk_get_nbd_io(nbd); + if (!nbd->io_in_recv) { + return -ENOMEM; + } + } + + io = nbd->io_in_recv; + + if (io->state == NBD_IO_RECV_REQ) { + ret = read_from_socket(nbd->spdk_sp_fd, (char *)&io->req + io->offset, + sizeof(io->req) - io->offset); + if (ret < 0) { + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + + /* request is fully received */ + if (io->offset == sizeof(io->req)) { + io->offset = 0; + + /* req magic check */ + if (from_be32(&io->req.magic) != NBD_REQUEST_MAGIC) { + SPDK_ERRLOG("invalid request magic\n"); + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; + return -EINVAL; + } + + /* io except read/write should ignore payload */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE || + from_be32(&io->req.type) == NBD_CMD_READ) { + io->payload_size = from_be32(&io->req.len); + } else { + io->payload_size = 0; + } + + /* io payload allocate */ + if (io->payload_size) { + io->payload = spdk_dma_malloc(io->payload_size, nbd->buf_align, NULL); + if (io->payload == NULL) { + SPDK_ERRLOG("could not allocate io->payload of size %d\n", io->payload_size); + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; + return -ENOMEM; + } + } else { + io->payload = NULL; + } + + /* next io step */ + if (from_be32(&io->req.type) == NBD_CMD_WRITE) { + io->state = NBD_IO_RECV_PAYLOAD; + } else { + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + } + } + + if (io->state == NBD_IO_RECV_PAYLOAD) { + ret = read_from_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret < 0) { + spdk_put_nbd_io(nbd, io); + nbd->io_in_recv = NULL; + return ret; + } + + io->offset += ret; + + /* request payload is fully received */ + if (io->offset == io->payload_size) { + io->offset = 0; + io->state = NBD_IO_XMIT_RESP; + nbd->io_in_recv = NULL; + TAILQ_INSERT_TAIL(&nbd->received_io_list, io, tailq); + } + + } + + return 0; +} + +static int +spdk_nbd_io_recv(struct spdk_nbd_disk *nbd) +{ + int i, ret = 0; + + /* + * nbd server should not accept request in both soft and hard + * disconnect states. 
+ */ + if (nbd->state != NBD_DISK_STATE_RUNNING) { + return 0; + } + + for (i = 0; i < GET_IO_LOOP_COUNT; i++) { + ret = spdk_nbd_io_recv_internal(nbd); + if (ret != 0) { + return ret; + } + } + + return 0; +} + +static int +spdk_nbd_io_xmit_internal(struct spdk_nbd_disk *nbd) +{ + struct nbd_io *io; + int ret = 0; + + io = TAILQ_FIRST(&nbd->executed_io_list); + if (io == NULL) { + return 0; + } + + /* Remove IO from list now assuming it will be completed. It will be inserted + * back to the head if it cannot be completed. This approach is specifically + * taken to work around a scan-build use-after-free mischaracterization. + */ + TAILQ_REMOVE(&nbd->executed_io_list, io, tailq); + + /* resp error and handler are already set in io_done */ + + if (io->state == NBD_IO_XMIT_RESP) { + ret = write_to_socket(nbd->spdk_sp_fd, (char *)&io->resp + io->offset, + sizeof(io->resp) - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + + /* response is fully transmitted */ + if (io->offset == sizeof(io->resp)) { + io->offset = 0; + + /* transmit payload only when NBD_CMD_READ with no resp error */ + if (from_be32(&io->req.type) != NBD_CMD_READ || io->resp.error != 0) { + spdk_put_nbd_io(nbd, io); + return 0; + } else { + io->state = NBD_IO_XMIT_PAYLOAD; + } + } + } + + if (io->state == NBD_IO_XMIT_PAYLOAD) { + ret = write_to_socket(nbd->spdk_sp_fd, io->payload + io->offset, io->payload_size - io->offset); + if (ret <= 0) { + goto reinsert; + } + + io->offset += ret; + + /* read payload is fully transmitted */ + if (io->offset == io->payload_size) { + spdk_put_nbd_io(nbd, io); + return 0; + } + } + +reinsert: + TAILQ_INSERT_HEAD(&nbd->executed_io_list, io, tailq); + return ret; +} + +static int +spdk_nbd_io_xmit(struct spdk_nbd_disk *nbd) +{ + int ret = 0; + + /* + * For soft disconnection, nbd server must handle all outstanding + * request before closing connection. + */ + if (nbd->state == NBD_DISK_STATE_HARDDISC) { + return 0; + } + + while (!TAILQ_EMPTY(&nbd->executed_io_list)) { + ret = spdk_nbd_io_xmit_internal(nbd); + if (ret != 0) { + return ret; + } + } + + /* + * For soft disconnection, nbd server can close connection after all + * outstanding request are transmitted. + */ + if (nbd->state == NBD_DISK_STATE_SOFTDISC && !spdk_nbd_io_xmit_check(nbd)) { + return -1; + } + + return 0; +} + +/** + * Poll an NBD instance. + * + * \return 0 on success or negated errno values on error (e.g. connection closed). + */ +static int +_spdk_nbd_poll(struct spdk_nbd_disk *nbd) +{ + int rc; + + /* transmit executed io first */ + rc = spdk_nbd_io_xmit(nbd); + if (rc < 0) { + return rc; + } + + rc = spdk_nbd_io_recv(nbd); + if (rc < 0) { + return rc; + } + + rc = spdk_nbd_io_exec(nbd); + + return rc; +} + +static int +spdk_nbd_poll(void *arg) +{ + struct spdk_nbd_disk *nbd = arg; + int rc; + + rc = _spdk_nbd_poll(nbd); + if (rc < 0) { + SPDK_INFOLOG(SPDK_LOG_NBD, "spdk_nbd_poll() returned %s (%d); closing connection\n", + spdk_strerror(-rc), rc); + spdk_nbd_stop(nbd); + } + + return -1; +} + +static void * +nbd_start_kernel(void *arg) +{ + int dev_fd = (int)(intptr_t)arg; + + spdk_unaffinitize_thread(); + + /* This will block in the kernel until we close the spdk_sp_fd. 
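+ * The NBD_DO_IT ioctl below runs the kernel's transmission loop on the
+ * socket set earlier with NBD_SET_SOCK and does not return until the
+ * device is disconnected, which is why it gets its own detached thread.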
*/ + ioctl(dev_fd, NBD_DO_IT); + + pthread_exit(NULL); +} + +static void +spdk_nbd_bdev_hot_remove(void *remove_ctx) +{ + struct spdk_nbd_disk *nbd = remove_ctx; + + spdk_nbd_stop(nbd); +} + +struct spdk_nbd_disk * +spdk_nbd_start(const char *bdev_name, const char *nbd_path) +{ + struct spdk_nbd_disk *nbd; + struct spdk_bdev *bdev; + pthread_t tid; + int rc; + int sp[2]; + int flag; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("no bdev %s exists\n", bdev_name); + return NULL; + } + + nbd = calloc(1, sizeof(*nbd)); + if (nbd == NULL) { + return NULL; + } + + nbd->dev_fd = -1; + nbd->spdk_sp_fd = -1; + nbd->kernel_sp_fd = -1; + + rc = spdk_bdev_open(bdev, true, spdk_nbd_bdev_hot_remove, nbd, &nbd->bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("could not open bdev %s, error=%d\n", spdk_bdev_get_name(bdev), rc); + goto err; + } + + nbd->bdev = bdev; + + nbd->ch = spdk_bdev_get_io_channel(nbd->bdev_desc); + nbd->buf_align = spdk_max(spdk_bdev_get_buf_align(bdev), 64); + + rc = socketpair(AF_UNIX, SOCK_STREAM, 0, sp); + if (rc != 0) { + SPDK_ERRLOG("socketpair failed\n"); + goto err; + } + + nbd->spdk_sp_fd = sp[0]; + nbd->kernel_sp_fd = sp[1]; + nbd->nbd_path = strdup(nbd_path); + if (!nbd->nbd_path) { + SPDK_ERRLOG("strdup allocation failure\n"); + goto err; + } + + TAILQ_INIT(&nbd->received_io_list); + TAILQ_INIT(&nbd->executed_io_list); + + /* Add nbd_disk to the end of disk list */ + rc = spdk_nbd_disk_register(nbd); + if (rc != 0) { + goto err; + } + + nbd->dev_fd = open(nbd_path, O_RDWR); + if (nbd->dev_fd == -1) { + SPDK_ERRLOG("open(\"%s\") failed: %s\n", nbd_path, spdk_strerror(errno)); + goto err; + } + + rc = ioctl(nbd->dev_fd, NBD_SET_BLKSIZE, spdk_bdev_get_block_size(bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_BLKSIZE) failed: %s\n", spdk_strerror(errno)); + goto err; + } + + rc = ioctl(nbd->dev_fd, NBD_SET_SIZE_BLOCKS, spdk_bdev_get_num_blocks(bdev)); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_SIZE_BLOCKS) failed: %s\n", spdk_strerror(errno)); + goto err; + } + + rc = ioctl(nbd->dev_fd, NBD_CLEAR_SOCK); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_CLEAR_SOCK) failed: %s\n", spdk_strerror(errno)); + goto err; + } + + SPDK_INFOLOG(SPDK_LOG_NBD, "Enabling kernel access to bdev %s via %s\n", + spdk_bdev_get_name(bdev), nbd_path); + + rc = ioctl(nbd->dev_fd, NBD_SET_SOCK, nbd->kernel_sp_fd); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_SOCK) failed: %s\n", spdk_strerror(errno)); + goto err; + } + +#ifdef NBD_FLAG_SEND_TRIM + rc = ioctl(nbd->dev_fd, NBD_SET_FLAGS, NBD_FLAG_SEND_TRIM); + if (rc == -1) { + SPDK_ERRLOG("ioctl(NBD_SET_FLAGS) failed: %s\n", spdk_strerror(errno)); + goto err; + } +#endif + + rc = pthread_create(&tid, NULL, nbd_start_kernel, (void *)(intptr_t)nbd->dev_fd); + if (rc != 0) { + SPDK_ERRLOG("could not create thread: %s\n", spdk_strerror(rc)); + goto err; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach thread for nbd kernel: %s\n", spdk_strerror(rc)); + goto err; + } + + flag = fcntl(nbd->spdk_sp_fd, F_GETFL); + if (fcntl(nbd->spdk_sp_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + nbd->spdk_sp_fd, spdk_strerror(errno)); + goto err; + } + + nbd->nbd_poller = spdk_poller_register(spdk_nbd_poll, nbd, 0); + + return nbd; + +err: + spdk_nbd_stop(nbd); + + return NULL; +} + +SPDK_LOG_REGISTER_COMPONENT("nbd", SPDK_LOG_NBD) diff --git a/src/spdk/lib/nbd/nbd_internal.h b/src/spdk/lib/nbd/nbd_internal.h new file mode 100644 
index 00000000..adf1cb21 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_internal.h @@ -0,0 +1,52 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NBD_INTERNAL_H +#define SPDK_NBD_INTERNAL_H + +#include "spdk/stdinc.h" +#include "spdk/nbd.h" + +struct spdk_nbd_disk *spdk_nbd_disk_find_by_nbd_path(const char *nbd_path); + +struct spdk_nbd_disk *spdk_nbd_disk_first(void); + +struct spdk_nbd_disk *spdk_nbd_disk_next(struct spdk_nbd_disk *prev); + +const char *spdk_nbd_disk_get_nbd_path(struct spdk_nbd_disk *nbd); + +const char *spdk_nbd_disk_get_bdev_name(struct spdk_nbd_disk *nbd); + +void nbd_disconnect(struct spdk_nbd_disk *nbd); + +#endif /* SPDK_NBD_INTERNAL_H */ diff --git a/src/spdk/lib/nbd/nbd_rpc.c b/src/spdk/lib/nbd/nbd_rpc.c new file mode 100644 index 00000000..bec64a13 --- /dev/null +++ b/src/spdk/lib/nbd/nbd_rpc.c @@ -0,0 +1,304 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/string.h" +#include "spdk/env.h" +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include + +#include "nbd_internal.h" +#include "spdk_internal/log.h" + +struct rpc_start_nbd_disk { + char *bdev_name; + char *nbd_device; +}; + +static void +free_rpc_start_nbd_disk(struct rpc_start_nbd_disk *req) +{ + free(req->bdev_name); + free(req->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_start_nbd_disk_decoders[] = { + {"bdev_name", offsetof(struct rpc_start_nbd_disk, bdev_name), spdk_json_decode_string}, + {"nbd_device", offsetof(struct rpc_start_nbd_disk, nbd_device), spdk_json_decode_string}, +}; + +static void +spdk_rpc_start_nbd_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_start_nbd_disk req = {}; + struct spdk_json_write_ctx *w; + struct spdk_nbd_disk *nbd; + + if (spdk_json_decode_object(params, rpc_start_nbd_disk_decoders, + SPDK_COUNTOF(rpc_start_nbd_disk_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.nbd_device == NULL || req.bdev_name == NULL) { + goto invalid; + } + + /* make sure nbd_device is not registered */ + nbd = spdk_nbd_disk_find_by_nbd_path(req.nbd_device); + if (nbd) { + goto invalid; + } + + nbd = spdk_nbd_start(req.bdev_name, req.nbd_device); + if (!nbd) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free_rpc_start_nbd_disk(&req); + return; + } + + spdk_json_write_string(w, req.nbd_device); + spdk_jsonrpc_end_result(request, w); + free_rpc_start_nbd_disk(&req); + return; + +invalid: + free_rpc_start_nbd_disk(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); +} + +SPDK_RPC_REGISTER("start_nbd_disk", spdk_rpc_start_nbd_disk, SPDK_RPC_RUNTIME) + +struct rpc_stop_nbd_disk { + char *nbd_device; +}; + +static void +free_rpc_stop_nbd_disk(struct rpc_stop_nbd_disk *req) +{ + free(req->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_stop_nbd_disk_decoders[] = { + {"nbd_device", offsetof(struct rpc_stop_nbd_disk, nbd_device), spdk_json_decode_string}, +}; + +struct nbd_disconnect_arg { + struct spdk_jsonrpc_request *request; + struct spdk_nbd_disk *nbd; +}; + +static void * +nbd_disconnect_thread(void *arg) +{ + struct nbd_disconnect_arg *thd_arg = arg; + struct spdk_json_write_ctx *w; + + spdk_unaffinitize_thread(); + + nbd_disconnect(thd_arg->nbd); + + w = spdk_jsonrpc_begin_result(thd_arg->request); + if (w == NULL) { + goto out; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(thd_arg->request, w); + +out: + free(thd_arg); + pthread_exit(NULL); +} + +static void +spdk_rpc_stop_nbd_disk(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_stop_nbd_disk req = {}; + struct spdk_nbd_disk *nbd; + pthread_t tid; + struct nbd_disconnect_arg *thd_arg = NULL; + int rc; + + if (spdk_json_decode_object(params, 
rpc_stop_nbd_disk_decoders, + SPDK_COUNTOF(rpc_stop_nbd_disk_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto out; + } + + if (req.nbd_device == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto out; + } + + /* make sure nbd_device is registered */ + nbd = spdk_nbd_disk_find_by_nbd_path(req.nbd_device); + if (!nbd) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + goto out; + } + + /* + * thd_arg should be freed by created thread + * if thread is created successfully. + */ + thd_arg = malloc(sizeof(*thd_arg)); + if (!thd_arg) { + SPDK_ERRLOG("could not allocate nbd disconnect thread arg\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, "Out of memory"); + goto out; + } + + thd_arg->request = request; + thd_arg->nbd = nbd; + + /* + * NBD ioctl of disconnect will block until data are flushed. + * Create separate thread to execute it. + */ + rc = pthread_create(&tid, NULL, nbd_disconnect_thread, (void *)thd_arg); + if (rc != 0) { + SPDK_ERRLOG("could not create nbd disconnect thread: %s\n", spdk_strerror(rc)); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, spdk_strerror(rc)); + free(thd_arg); + goto out; + } + + rc = pthread_detach(tid); + if (rc != 0) { + SPDK_ERRLOG("could not detach nbd disconnect thread: %s\n", spdk_strerror(rc)); + goto out; + } + +out: + free_rpc_stop_nbd_disk(&req); +} + +SPDK_RPC_REGISTER("stop_nbd_disk", spdk_rpc_stop_nbd_disk, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_dump_nbd_info(struct spdk_json_write_ctx *w, + struct spdk_nbd_disk *nbd) +{ + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "nbd_device"); + spdk_json_write_string(w, spdk_nbd_disk_get_nbd_path(nbd)); + + spdk_json_write_name(w, "bdev_name"); + spdk_json_write_string(w, spdk_nbd_disk_get_bdev_name(nbd)); + + spdk_json_write_object_end(w); +} + +struct rpc_get_nbd_disks { + char *nbd_device; +}; + +static void +free_rpc_get_nbd_disks(struct rpc_get_nbd_disks *r) +{ + free(r->nbd_device); +} + +static const struct spdk_json_object_decoder rpc_get_nbd_disks_decoders[] = { + {"nbd_device", offsetof(struct rpc_get_nbd_disks, nbd_device), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_nbd_disks(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_nbd_disks req = {}; + struct spdk_json_write_ctx *w; + struct spdk_nbd_disk *nbd = NULL; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_nbd_disks_decoders, + SPDK_COUNTOF(rpc_get_nbd_disks_decoders), + &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + goto invalid; + } + + if (req.nbd_device) { + nbd = spdk_nbd_disk_find_by_nbd_path(req.nbd_device); + if (nbd == NULL) { + SPDK_ERRLOG("nbd device '%s' does not exist\n", req.nbd_device); + goto invalid; + } + + free_rpc_get_nbd_disks(&req); + } + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + if (nbd != NULL) { + spdk_rpc_dump_nbd_info(w, nbd); + } else { + for (nbd = spdk_nbd_disk_first(); nbd != NULL; nbd = spdk_nbd_disk_next(nbd)) { + spdk_rpc_dump_nbd_info(w, nbd); + } + } + + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); + + return; + +invalid: + 
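+ /* Reached when JSON decoding fails or the named nbd device is not registered. */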
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + + free_rpc_get_nbd_disks(&req); +} +SPDK_RPC_REGISTER("get_nbd_disks", spdk_rpc_get_nbd_disks, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/net/Makefile b/src/spdk/lib/net/Makefile new file mode 100644 index 00000000..6431e7be --- /dev/null +++ b/src/spdk/lib/net/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = interface.c net_rpc.c + +LIBNAME = net + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/net/interface.c b/src/spdk/lib/net/interface.c new file mode 100644 index 00000000..5102695b --- /dev/null +++ b/src/spdk/lib/net/interface.c @@ -0,0 +1,505 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "net_internal.h"
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include "spdk/log.h"
+#include "spdk/net.h"
+
+#ifdef __linux__ /* Interface management is Linux-specific */
+
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+static TAILQ_HEAD(, spdk_interface) g_interface_head;
+
+static pthread_mutex_t interface_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static int spdk_get_ifc_ipv4(void)
+{
+ int ret;
+ int rtattrlen;
+ int netlink_fd;
+ uint32_t ipv4_addr;
+
+ struct {
+ struct nlmsghdr n;
+ struct ifaddrmsg r;
+ struct rtattr rta;
+ } req;
+ char buf[16384];
+ struct nlmsghdr *nlmp;
+ struct ifaddrmsg *rtmp;
+ struct rtattr *rtatp;
+ struct spdk_interface *ifc;
+
+ netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE);
+ if (netlink_fd < 0) {
+ SPDK_ERRLOG("socket failed!\n");
+ return 1;
+ }
+
+ /*
+ * Prepare a message structure
+ */
+ memset(&req, 0, sizeof(req));
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
+ req.n.nlmsg_type = RTM_GETADDR;
+
+ /* IPv4 only */
+ req.r.ifa_family = AF_INET;
+
+ /*
+ * Fill up all the attributes for the rtnetlink header.
+ */
+ assert(&req.rta == (struct rtattr *)(((char *)&req) + NLMSG_ALIGN(req.n.nlmsg_len)));
+ req.rta.rta_len = RTA_LENGTH(16);
+
+ /* Send and recv the message from kernel */
+ ret = send(netlink_fd, &req, req.n.nlmsg_len, 0);
+ if (ret < 0) {
+ SPDK_ERRLOG("netlink send failed: %s\n", spdk_strerror(errno));
+ ret = 1;
+ goto exit;
+ }
+
+ ret = recv(netlink_fd, buf, sizeof(buf), 0);
+ if (ret <= 0) {
+ SPDK_ERRLOG("netlink recv failed: %s\n", spdk_strerror(errno));
+ ret = 1;
+ goto exit;
+ }
+
+ for (nlmp = (struct nlmsghdr *)buf; ret > (int)sizeof(*nlmp);) {
+ int len = nlmp->nlmsg_len;
+ int req_len = len - sizeof(*nlmp);
+
+ if (req_len < 0 || len > ret) {
+ SPDK_ERRLOG("error\n");
+ ret = 1;
+ goto exit;
+ }
+
+ if (!NLMSG_OK(nlmp, (uint32_t)ret)) {
+ SPDK_ERRLOG("NLMSG not OK\n");
+ ret = 1;
+ goto exit;
+ }
+
+ rtmp = (struct ifaddrmsg *)NLMSG_DATA(nlmp);
+ rtatp = (struct rtattr *)IFA_RTA(rtmp);
+
+ rtattrlen = IFA_PAYLOAD(nlmp);
+
+ for (; RTA_OK(rtatp, rtattrlen); rtatp = RTA_NEXT(rtatp, rtattrlen)) {
+ if (rtatp->rta_type == IFA_LOCAL) {
+ memcpy(&ipv4_addr, (struct in_addr *)RTA_DATA(rtatp),
+ sizeof(struct in_addr));
+ TAILQ_FOREACH(ifc, &g_interface_head, tailq) {
+ if (ifc->index == rtmp->ifa_index) {
+ /* add a new IP address to interface */
+ if (ifc->num_ip_addresses >= SPDK_MAX_IP_PER_IFC) {
+ SPDK_ERRLOG("SPDK: number of IP addresses supported for %s exceeded.
limit=%d\n", + ifc->name, + SPDK_MAX_IP_PER_IFC); + break; + } + ifc->ip_address[ifc->num_ip_addresses] = ipv4_addr; + ifc->num_ip_addresses++; + break; + } + } + } + } + ret -= NLMSG_ALIGN(len); + nlmp = (struct nlmsghdr *)((char *)nlmp + NLMSG_ALIGN(len)); + } + ret = 0; + +exit: + close(netlink_fd); + return ret; +} + + +static int spdk_process_new_interface_msg(struct nlmsghdr *h) +{ + int len; + struct spdk_interface *ifc; + struct ifinfomsg *iface; + struct rtattr *attribute; + + iface = (struct ifinfomsg *)NLMSG_DATA(h); + + ifc = (struct spdk_interface *) malloc(sizeof(*ifc)); + if (ifc == NULL) { + SPDK_ERRLOG("%s: Malloc failed\n", __func__); + return 1; + } + + memset(ifc, 0, sizeof(*ifc)); + + /* Set interface index */ + ifc->index = iface->ifi_index; + + len = h->nlmsg_len - NLMSG_LENGTH(sizeof(*iface)); + + /* Loop over all attributes for the NEWLINK message */ + for (attribute = IFLA_RTA(iface); RTA_OK(attribute, len); attribute = RTA_NEXT(attribute, len)) { + switch (attribute->rta_type) { + case IFLA_IFNAME: + if (if_indextoname(iface->ifi_index, ifc->name) == NULL) { + SPDK_ERRLOG("Indextoname failed!\n"); + free(ifc); + return 2; + } + break; + default: + break; + } + } + TAILQ_INSERT_TAIL(&g_interface_head, ifc, tailq); + return 0; +} + +static int spdk_prepare_ifc_list(void) +{ + int ret = 0; + struct nl_req_s { + struct nlmsghdr hdr; + struct rtgenmsg gen; + struct ifinfomsg ifi; + }; + int netlink_fd; + struct sockaddr_nl local; /* Our local (user space) side of the communication */ + struct sockaddr_nl kernel; /* The remote (kernel space) side of the communication */ + + struct msghdr rtnl_msg; /* Generic msghdr struct for use with sendmsg */ + struct iovec io; /* IO vector for sendmsg */ + + struct nl_req_s req; /* Structure that describes the rtnetlink packet itself */ + char reply[16384]; /* a large buffer to receive lots of link information */ + + pid_t pid = getpid(); /* Our process ID to build the correct netlink address */ + int end = 0; /* some flag to end loop parsing */ + + /* + * Prepare netlink socket for kernel/user space communication + */ + netlink_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (netlink_fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return 1; + } + + memset(&local, 0, sizeof(local)); /* Fill-in local address information */ + local.nl_family = AF_NETLINK; + local.nl_pid = pid; + local.nl_groups = 0; + + /* RTNL socket is ready to use, prepare and send L2 request. 
*/ + memset(&rtnl_msg, 0, sizeof(rtnl_msg)); + memset(&kernel, 0, sizeof(kernel)); + memset(&req, 0, sizeof(req)); + + kernel.nl_family = AF_NETLINK; /* Fill-in kernel address (destination of our message) */ + + req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtgenmsg)); + req.hdr.nlmsg_type = RTM_GETLINK; + req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.hdr.nlmsg_seq = 1; + req.hdr.nlmsg_pid = pid; + + req.ifi.ifi_family = AF_UNSPEC; + req.ifi.ifi_type = 1; + + io.iov_base = &req; + io.iov_len = req.hdr.nlmsg_len; + rtnl_msg.msg_iov = &io; + rtnl_msg.msg_iovlen = 1; + rtnl_msg.msg_name = &kernel; + rtnl_msg.msg_namelen = sizeof(kernel); + + if (sendmsg(netlink_fd, &rtnl_msg, 0) == -1) { + SPDK_ERRLOG("Sendmsg failed!\n"); + ret = 1; + goto exit; + } + + /* Parse reply */ + while (!end) { + int len; + struct nlmsghdr *msg_ptr; /* Pointer to current message part */ + + struct msghdr rtnl_reply; /* Generic msghdr structure for use with recvmsg */ + struct iovec io_reply; + + memset(&io_reply, 0, sizeof(io_reply)); + memset(&rtnl_reply, 0, sizeof(rtnl_reply)); + + io.iov_base = reply; + io.iov_len = 8192; + rtnl_reply.msg_iov = &io; + rtnl_reply.msg_iovlen = 1; + rtnl_reply.msg_name = &kernel; + rtnl_reply.msg_namelen = sizeof(kernel); + + /* Read as much data as fits in the receive buffer */ + len = recvmsg(netlink_fd, &rtnl_reply, 0); + if (len) { + for (msg_ptr = (struct nlmsghdr *) reply; NLMSG_OK(msg_ptr, (uint32_t)len); + msg_ptr = NLMSG_NEXT(msg_ptr, len)) { + switch (msg_ptr->nlmsg_type) { + case NLMSG_DONE: /* This is the special meaning NLMSG_DONE message we asked for by using NLM_F_DUMP flag */ + end++; + break; + case RTM_NEWLINK: /* This is a RTM_NEWLINK message, which contains lots of information about a link */ + ret = spdk_process_new_interface_msg(msg_ptr); + if (ret != 0) { + goto exit; + } + break; + default: + break; + } + } + } + } +exit: + close(netlink_fd); + return ret; +} + +static int spdk_interface_available(uint32_t ifc_index) +{ + struct spdk_interface *ifc_entry; + + pthread_mutex_lock(&interface_lock); + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + if (ifc_entry->index == ifc_index) { + pthread_mutex_unlock(&interface_lock); + return 0; + } + } + pthread_mutex_unlock(&interface_lock); + + return -1; +} + +static int netlink_addr_msg(uint32_t ifc_idx, uint32_t ip_address, uint32_t create) +{ + int fd; + struct sockaddr_nl la; + struct sockaddr_nl pa; + struct msghdr msg; + struct iovec iov; + int ifal; + struct { + struct nlmsghdr n; + struct ifaddrmsg r; + char buf[16384]; + } req; + struct rtattr *rta; + + if (spdk_interface_available(ifc_idx)) { + return -1; + } + + fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + SPDK_ERRLOG("socket failed!\n"); + return -1; + } + + /* setup local address & bind using this address. */ + bzero(&la, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_pid = getpid(); + bind(fd, (struct sockaddr *) &la, sizeof(la)); + + /* initialize RTNETLINK request buffer. */ + bzero(&req, sizeof(req)); + + /* compute the initial length of the service request. */ + ifal = sizeof(struct ifaddrmsg); + + /* add first attrib: set IP addr and RTNETLINK buffer size. */ + rta = (struct rtattr *) req.buf; + rta->rta_type = IFA_ADDRESS; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* add second attrib. 
*/ + rta = (struct rtattr *)(((char *)rta) + rta->rta_len); + rta->rta_type = IFA_LOCAL; + rta->rta_len = sizeof(struct rtattr) + 4; + memcpy(((char *)rta) + sizeof(struct rtattr), &ip_address, sizeof(ip_address)); + ifal += rta->rta_len; + + /* setup the NETLINK header. */ + req.n.nlmsg_len = NLMSG_LENGTH(ifal); + if (create) { + req.n.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_APPEND; + req.n.nlmsg_type = RTM_NEWADDR; + } else { + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_DELADDR; + } + + /* setup the service header (struct rtmsg). */ + req.r.ifa_family = AF_INET; + req.r.ifa_prefixlen = 32; /* hardcoded */ + req.r.ifa_flags = IFA_F_PERMANENT | IFA_F_SECONDARY; + req.r.ifa_index = ifc_idx; + req.r.ifa_scope = 0; + + /* create the remote address to communicate. */ + bzero(&pa, sizeof(pa)); + pa.nl_family = AF_NETLINK; + + /* initialize & create the struct msghdr supplied to the sendmsg() function. */ + bzero(&msg, sizeof(msg)); + msg.msg_name = (void *) &pa; + msg.msg_namelen = sizeof(pa); + + /* place the pointer & size of the RTNETLINK message in the struct msghdr. */ + iov.iov_base = (void *) &req.n; + iov.iov_len = req.n.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + /* send the RTNETLINK message to kernel. */ + sendmsg(fd, &msg, 0); + close(fd); + return 0; +} + +static void spdk_interface_ip_update(void) +{ + struct spdk_interface *ifc_entry; + + pthread_mutex_lock(&interface_lock); + TAILQ_FOREACH(ifc_entry, &g_interface_head, tailq) { + ifc_entry->num_ip_addresses = 0; + memset(ifc_entry->ip_address, 0, sizeof(ifc_entry->ip_address)); + } + spdk_get_ifc_ipv4(); + pthread_mutex_unlock(&interface_lock); +} + +int +spdk_interface_init(void) +{ + int rc = 0; + + TAILQ_INIT(&g_interface_head); + rc = spdk_prepare_ifc_list(); + if (!rc) { + rc = spdk_get_ifc_ipv4(); + } + + return rc; +} + +void +spdk_interface_destroy(void) +{ + struct spdk_interface *ifc_entry; + + while (!TAILQ_EMPTY(&g_interface_head)) { + ifc_entry = TAILQ_FIRST(&g_interface_head); + TAILQ_REMOVE(&g_interface_head, ifc_entry, tailq); + free(ifc_entry); + } +} + +int +spdk_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + + addr = inet_addr(ip_addr); + return netlink_addr_msg(ifc_index, addr, 1); +} + +int +spdk_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + uint32_t addr; + + addr = inet_addr(ip_addr); + return netlink_addr_msg(ifc_index, addr, 0); +} + +void *spdk_interface_get_list(void) +{ + spdk_interface_ip_update(); + return &g_interface_head; +} + +#else /* Not Linux */ + +int +spdk_interface_init(void) +{ + return 0; +} + +void +spdk_interface_destroy(void) +{ +} + +int +spdk_interface_add_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +int +spdk_interface_delete_ip_address(int ifc_index, char *ip_addr) +{ + return -1; +} + +void * +spdk_interface_get_list(void) +{ + return NULL; +} + +#endif diff --git a/src/spdk/lib/net/net_internal.h b/src/spdk/lib/net/net_internal.h new file mode 100644 index 00000000..8dbaf633 --- /dev/null +++ b/src/spdk/lib/net/net_internal.h @@ -0,0 +1,79 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NET_INTERNAL_H +#define SPDK_NET_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" + +#define SPDK_IFNAMSIZE 32 +#define SPDK_MAX_IP_PER_IFC 32 + +struct spdk_interface { + char name[SPDK_IFNAMSIZE]; + uint32_t index; + uint32_t num_ip_addresses; /* number of IP addresses defined */ + uint32_t ip_address[SPDK_MAX_IP_PER_IFC]; + TAILQ_ENTRY(spdk_interface) tailq; +}; + +/** + * Add an ip address to the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to add. + * + * \return 0 on success, -1 on failure. + */ +int spdk_interface_add_ip_address(int ifc_index, char *ip_addr); + +/** + * Delete an ip address from the network interface. + * + * \param ifc_index Index of the network interface. + * \param ip_addr Ip address to delete. + * + * \return 0 on success, -1 on failure. + */ +int spdk_interface_delete_ip_address(int ifc_index, char *ip_addr); + +/** + * Get the list of all the network interfaces. + * + * \return a pointer to the head of the linked list of all the network interfaces. + */ +void *spdk_interface_get_list(void); + +#endif /* SPDK_NET_INTERNAL_H */ diff --git a/src/spdk/lib/net/net_rpc.c b/src/spdk/lib/net/net_rpc.c new file mode 100644 index 00000000..aaaf6865 --- /dev/null +++ b/src/spdk/lib/net/net_rpc.c @@ -0,0 +1,180 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "net_internal.h" + +#include "spdk/stdinc.h" + +#include "spdk/rpc.h" +#include "spdk/net.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +struct rpc_ip_address { + int32_t ifc_index; + char *ip_address; +}; + +static void +free_rpc_ip_address(struct rpc_ip_address *req) +{ + free(req->ip_address); +} + +static const struct spdk_json_object_decoder rpc_ip_address_decoders[] = { + {"ifc_index", offsetof(struct rpc_ip_address, ifc_index), spdk_json_decode_int32}, + {"ip_address", offsetof(struct rpc_ip_address, ip_address), spdk_json_decode_string}, +}; + +static void +spdk_rpc_add_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (spdk_interface_add_ip_address(req.ifc_index, req.ip_address)) { + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("add_ip_address", spdk_rpc_add_ip_address, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_delete_ip_address(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_ip_address req = {}; + struct spdk_json_write_ctx *w; + + if (spdk_json_decode_object(params, rpc_ip_address_decoders, + SPDK_COUNTOF(rpc_ip_address_decoders), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_NET, "spdk_json_decode_object failed\n"); + goto invalid; + } + + if (spdk_interface_delete_ip_address(req.ifc_index, req.ip_address)) { + goto invalid; + } + + free_rpc_ip_address(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, "Invalid parameters"); + free_rpc_ip_address(&req); +} +SPDK_RPC_REGISTER("delete_ip_address", spdk_rpc_delete_ip_address, SPDK_RPC_RUNTIME) + +static void +spdk_rpc_get_interfaces(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + TAILQ_HEAD(, spdk_interface) *interface_head = spdk_interface_get_list(); + struct spdk_interface *ifc; + char *ip_address; + struct in_addr inaddr; + uint32_t i; + + if (params != NULL) { + 
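+ /* get_interfaces takes no parameters, so any supplied params object is an error. */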
spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_interfaces requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + TAILQ_FOREACH(ifc, interface_head, tailq) { + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "name"); + spdk_json_write_string(w, ifc->name); + + spdk_json_write_name(w, "ifc_index"); + spdk_json_write_int32(w, ifc->index); + + spdk_json_write_name(w, "ip_addr"); + spdk_json_write_array_begin(w); + for (i = 0; i < ifc->num_ip_addresses; i++) { + memcpy(&inaddr, &ifc->ip_address[i], sizeof(uint32_t)); + ip_address = inet_ntoa(inaddr); + spdk_json_write_string(w, ip_address); + } + spdk_json_write_array_end(w); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_interfaces", spdk_rpc_get_interfaces, SPDK_RPC_RUNTIME) + +SPDK_LOG_REGISTER_COMPONENT("net", SPDK_LOG_NET) diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile new file mode 100644 index 00000000..3351c87c --- /dev/null +++ b/src/spdk/lib/nvme/Makefile @@ -0,0 +1,61 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) 
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \ + nvme_ns_ocssd_cmd.c +C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c +LIBNAME = nvme +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c new file mode 100644 index 00000000..dc657966 --- /dev/null +++ b/src/spdk/lib/nvme/nvme.c @@ -0,0 +1,862 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/nvmf_spec.h" +#include "nvme_internal.h" + +#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver" + +struct nvme_driver *g_spdk_nvme_driver; +pid_t g_spdk_nvme_pid; + +int32_t spdk_nvme_retry_count; + +/* gross timeout of 180 seconds in milliseconds */ +static int g_nvme_driver_timeout_ms = 3 * 60 * 1000; + +static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_init_ctrlrs = + TAILQ_HEAD_INITIALIZER(g_nvme_init_ctrlrs); + +/* Per-process attached controller list */ +static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs = + TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs); + +/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */ +static bool +nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE; +} + +/* Caller must hold g_spdk_nvme_driver->lock */ +void +nvme_ctrlr_connected(struct spdk_nvme_ctrlr *ctrlr) +{ + TAILQ_INSERT_TAIL(&g_nvme_init_ctrlrs, ctrlr, tailq); +} + +int +spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + nvme_ctrlr_proc_put_ref(ctrlr); + + if (nvme_ctrlr_get_ref_count(ctrlr) == 0) { + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + nvme_ctrlr_destruct(ctrlr); + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return 0; +} + +void +nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_completion_poll_status *status = arg; + + /* + * Copy status into the argument passed by the caller, so that + * the caller can check the status to determine if the + * the request passed or failed. + */ + memcpy(&status->cpl, cpl, sizeof(*cpl)); + status->done = true; +} + +/** + * Poll qpair for completions until a command completes. + * + * \param qpair queue to poll + * \param status completion status + * \param robust_mutex optional robust mutex to lock while polling qpair + * + * \return 0 if command completed without error, negative errno on failure + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +spdk_nvme_wait_for_completion_robust_lock( + struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex) +{ + memset(&status->cpl, 0, sizeof(status->cpl)); + status->done = false; + + while (status->done == false) { + if (robust_mutex) { + nvme_robust_mutex_lock(robust_mutex); + } + + spdk_nvme_qpair_process_completions(qpair, 0); + + if (robust_mutex) { + nvme_robust_mutex_unlock(robust_mutex); + } + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? 
-EIO : 0; +} + +int +spdk_nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status) +{ + return spdk_nvme_wait_for_completion_robust_lock(qpair, status, NULL); +} + +static void +nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = arg; + enum spdk_nvme_data_transfer xfer; + + if (req->user_buffer && req->payload_size) { + /* Copy back to the user buffer and free the contig buffer */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST || + xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + assert(req->pid == getpid()); + memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size); + } + + spdk_dma_free(req->payload.contig_or_cb_arg); + } + + /* Call the user's original callback now that the buffer has been copied */ + req->user_cb_fn(req->user_cb_arg, cpl); +} + +/** + * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer. + * + * This is intended for use in non-fast-path functions (admin commands, reservations, etc.) + * where the overhead of a copy is not a problem. + */ +struct nvme_request * +nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, bool host_to_controller) +{ + struct nvme_request *req; + void *dma_buffer = NULL; + uint64_t phys_addr; + + if (buffer && payload_size) { + dma_buffer = spdk_zmalloc(payload_size, 4096, &phys_addr, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!dma_buffer) { + return NULL; + } + + if (host_to_controller) { + memcpy(dma_buffer, buffer, payload_size); + } + } + + req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete, + NULL); + if (!req) { + spdk_free(dma_buffer); + return NULL; + } + + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + req->user_buffer = buffer; + req->cb_arg = req; + + return req; +} + +/** + * Check if a request has exceeded the controller timeout. + * + * \param req request to check for timeout. + * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn) + * \param active_proc per-process data for the controller associated with req + * \param now_tick current time from spdk_get_ticks() + * \return 0 if requests submitted more recently than req should still be checked for timeouts, or + * 1 if requests newer than req need not be checked. + * + * The request's timeout callback will be called if needed; the caller is only responsible for + * calling this function on each outstanding request. + */ +int +nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, + uint64_t now_tick) +{ + struct spdk_nvme_qpair *qpair = req->qpair; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(active_proc->timeout_cb_fn != NULL); + + if (req->timed_out || req->submit_tick == 0) { + return 0; + } + + if (req->pid != g_spdk_nvme_pid) { + return 0; + } + + if (nvme_qpair_is_admin_queue(qpair) && + req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + return 0; + } + + if (req->submit_tick + active_proc->timeout_ticks > now_tick) { + return 1; + } + + req->timed_out = true; + + /* + * We don't want to expose the admin queue to the user, + * so when we're timing out admin commands set the + * qpair to NULL. 
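+ * The timeout callback can then recognize an admin command timeout by the
+ * NULL qpair argument.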
+ */ + active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr, + nvme_qpair_is_admin_queue(qpair) ? NULL : qpair, + cid); + return 0; +} + +int +nvme_robust_mutex_init_shared(pthread_mutex_t *mtx) +{ + int rc = 0; + +#ifdef __FreeBSD__ + pthread_mutex_init(mtx, NULL); +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); +#endif + + return rc; +} + +int +nvme_driver_init(void) +{ + int ret = 0; + /* Any socket ID */ + int socket_id = -1; + + /* Each process needs its own pid. */ + g_spdk_nvme_pid = getpid(); + + /* + * Only one thread from one process will do this driver init work. + * The primary process will reserve the shared memory and do the + * initialization. + * The secondary process will lookup the existing reserved memory. + */ + if (spdk_process_is_primary()) { + /* The unique named memzone already reserved. */ + if (g_spdk_nvme_driver != NULL) { + return 0; + } else { + g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME, + sizeof(struct nvme_driver), socket_id, + SPDK_MEMZONE_NO_IOVA_CONTIG); + } + + if (g_spdk_nvme_driver == NULL) { + SPDK_ERRLOG("primary process failed to reserve memory\n"); + + return -1; + } + } else { + g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME); + + /* The unique named memzone already reserved by the primary process. */ + if (g_spdk_nvme_driver != NULL) { + int ms_waited = 0; + + /* Wait the nvme driver to get initialized. */ + while ((g_spdk_nvme_driver->initialized == false) && + (ms_waited < g_nvme_driver_timeout_ms)) { + ms_waited++; + nvme_delay(1000); /* delay 1ms */ + } + if (g_spdk_nvme_driver->initialized == false) { + SPDK_ERRLOG("timeout waiting for primary process to init\n"); + + return -1; + } + } else { + SPDK_ERRLOG("primary process is not started yet\n"); + + return -1; + } + + return 0; + } + + /* + * At this moment, only one thread from the primary process will do + * the g_spdk_nvme_driver initialization + */ + assert(spdk_process_is_primary()); + + ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock); + if (ret != 0) { + SPDK_ERRLOG("failed to initialize mutex\n"); + spdk_memzone_free(SPDK_NVME_DRIVER_NAME); + return ret; + } + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + g_spdk_nvme_driver->initialized = false; + + TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs); + + spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id); + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ret; +} + +int +nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, void *devhandle, + spdk_nvme_probe_cb probe_cb, void *cb_ctx) +{ + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + assert(trid != NULL); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + + if (!probe_cb || probe_cb(cb_ctx, trid, &opts)) { + ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle); + if (ctrlr == NULL) { + SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr); + return -1; + } + + TAILQ_INSERT_TAIL(&g_nvme_init_ctrlrs, ctrlr, tailq); + return 0; + } + + return 1; +} + +static int +nvme_init_controllers(void *cb_ctx, spdk_nvme_attach_cb attach_cb) +{ + int rc = 0; + int start_rc; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + 
nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + /* Initialize all new controllers in the g_nvme_init_ctrlrs list in parallel. */ + while (!TAILQ_EMPTY(&g_nvme_init_ctrlrs)) { + TAILQ_FOREACH_SAFE(ctrlr, &g_nvme_init_ctrlrs, tailq, ctrlr_tmp) { + /* Drop the driver lock while calling nvme_ctrlr_process_init() + * since it needs to acquire the driver lock internally when initializing + * controller. + * + * TODO: Rethink the locking - maybe reset should take the lock so that start() and + * the functions it calls (in particular nvme_ctrlr_set_num_qpairs()) + * can assume it is held. + */ + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + start_rc = nvme_ctrlr_process_init(ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + if (start_rc) { + /* Controller failed to initialize. */ + TAILQ_REMOVE(&g_nvme_init_ctrlrs, ctrlr, tailq); + SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr); + nvme_ctrlr_destruct(ctrlr); + rc = -1; + break; + } + + if (ctrlr->state == NVME_CTRLR_STATE_READY) { + /* + * Controller has been initialized. + * Move it to the attached_ctrlrs list. + */ + TAILQ_REMOVE(&g_nvme_init_ctrlrs, ctrlr, tailq); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + + /* + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. + */ + nvme_ctrlr_proc_get_ref(ctrlr); + + /* + * Unlock while calling attach_cb() so the user can call other functions + * that may take the driver lock, like nvme_detach(). + */ + if (attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + attach_cb(cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + + break; + } + } + } + + g_spdk_nvme_driver->initialized = true; + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return rc; +} + +/* This function must not be called while holding g_spdk_nvme_driver->lock */ +static struct spdk_nvme_ctrlr * +spdk_nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(trid); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ctrlr; +} + +/* This function must be called while holding g_spdk_nvme_driver->lock */ +struct spdk_nvme_ctrlr * +spdk_nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + /* Search per-process list */ + TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + /* Search multi-process shared list */ + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + return NULL; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +spdk_nvme_probe_internal(const struct spdk_nvme_transport_id *trid, void *cb_ctx, + spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb, struct spdk_nvme_ctrlr **connected_ctrlr) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr; + bool direct_connect = (connected_ctrlr != NULL); + + if (!spdk_nvme_transport_available(trid->trtype)) { + SPDK_ERRLOG("NVMe trtype %u not available\n", trid->trtype); + 
return -1; + } + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + nvme_transport_ctrlr_scan(trid, cb_ctx, probe_cb, remove_cb, direct_connect); + + /* + * Probe controllers on the shared_attached_ctrlrs list + */ + if (!spdk_process_is_primary() && (trid->trtype == SPDK_NVME_TRANSPORT_PCIE)) { + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + /* Do not attach other ctrlrs if user specify a valid trid */ + if ((strlen(trid->traddr) != 0) && + (spdk_nvme_transport_id_compare(trid, &ctrlr->trid))) { + continue; + } + + nvme_ctrlr_proc_get_ref(ctrlr); + + /* + * Unlock while calling attach_cb() so the user can call other functions + * that may take the driver lock, like nvme_detach(). + */ + if (attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + attach_cb(cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + rc = 0; + + goto exit; + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + /* + * Keep going even if one or more nvme_attach() calls failed, + * but maintain the value of rc to signal errors when we return. + */ + + rc = nvme_init_controllers(cb_ctx, attach_cb); + +exit: + if (connected_ctrlr) { + *connected_ctrlr = spdk_nvme_get_ctrlr_by_trid(trid); + } + + return rc; +} + +int +spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx, + spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + int rc; + struct spdk_nvme_transport_id trid_pcie; + + rc = nvme_driver_init(); + if (rc != 0) { + return rc; + } + + if (trid == NULL) { + memset(&trid_pcie, 0, sizeof(trid_pcie)); + trid_pcie.trtype = SPDK_NVME_TRANSPORT_PCIE; + trid = &trid_pcie; + } + + return spdk_nvme_probe_internal(trid, cb_ctx, probe_cb, attach_cb, remove_cb, NULL); +} + +static bool +spdk_nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ctrlr_connect_opts *requested_opts = cb_ctx; + + assert(requested_opts->opts); + + assert(requested_opts->opts_size != 0); + + memcpy(opts, requested_opts->opts, spdk_min(sizeof(*opts), requested_opts->opts_size)); + + return true; +} + +struct spdk_nvme_ctrlr * +spdk_nvme_connect(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + int rc; + struct spdk_nvme_ctrlr_connect_opts connect_opts = {}; + struct spdk_nvme_ctrlr_connect_opts *user_connect_opts = NULL; + struct spdk_nvme_ctrlr *ctrlr = NULL; + spdk_nvme_probe_cb probe_cb = NULL; + + if (trid == NULL) { + SPDK_ERRLOG("No transport ID specified\n"); + return NULL; + } + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + if (opts && opts_size > 0) { + connect_opts.opts = opts; + connect_opts.opts_size = opts_size; + user_connect_opts = &connect_opts; + probe_cb = spdk_nvme_connect_probe_cb; + } + + spdk_nvme_probe_internal(trid, user_connect_opts, probe_cb, NULL, NULL, &ctrlr); + + return ctrlr; +} + +int +spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str) +{ + if (trtype == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "PCIe") == 0) { + *trtype = SPDK_NVME_TRANSPORT_PCIE; + } else if (strcasecmp(str, "RDMA") == 0) { + *trtype = SPDK_NVME_TRANSPORT_RDMA; + } else if (strcasecmp(str, "FC") == 0) { + *trtype = SPDK_NVME_TRANSPORT_FC; + } else { + return -ENOENT; + } + return 0; +} + 
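For context, the probe/attach flow implemented above (spdk_nvme_probe() with a probe callback that accepts or tunes controller options, and an attach callback that receives the initialized controller) is normally driven from application code roughly as follows. This is a minimal sketch against the public spdk/nvme.h and spdk/env.h APIs visible in this file; the callback names, the application name string, and the error handling are illustrative, and the environment setup assumes the DPDK-based env layer.

    #include "spdk/stdinc.h"
    #include "spdk/env.h"
    #include "spdk/nvme.h"

    /* Return true to let the driver attach to every controller that is found. */
    static bool
    probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
             struct spdk_nvme_ctrlr_opts *opts)
    {
            printf("probing %s\n", trid->traddr);
            return true;
    }

    /* Called once a controller has finished initialization and reached READY. */
    static void
    attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
              struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
    {
            struct spdk_nvme_ctrlr **out = cb_ctx;

            /* The per-process reference count was already taken before this callback. */
            *out = ctrlr;
    }

    int
    main(void)
    {
            struct spdk_env_opts env_opts;
            struct spdk_nvme_ctrlr *ctrlr = NULL;

            spdk_env_opts_init(&env_opts);
            env_opts.name = "nvme_probe_example";
            if (spdk_env_init(&env_opts) < 0) {
                    fprintf(stderr, "failed to initialize SPDK env\n");
                    return 1;
            }

            /* A NULL transport ID defaults to scanning the PCIe bus, as shown above. */
            if (spdk_nvme_probe(NULL, &ctrlr, probe_cb, attach_cb, NULL) != 0) {
                    fprintf(stderr, "spdk_nvme_probe() failed\n");
                    return 1;
            }

            if (ctrlr != NULL) {
                    spdk_nvme_detach(ctrlr);
            }
            return 0;
    }

When a single, known controller is wanted, spdk_nvme_connect() above gives the same result for one transport ID without enumerating every device.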
+const char * +spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) +{ + switch (trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + return "PCIe"; + case SPDK_NVME_TRANSPORT_RDMA: + return "RDMA"; + case SPDK_NVME_TRANSPORT_FC: + return "FC"; + default: + return NULL; + } +} + +int +spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str) +{ + if (adrfam == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "IPv4") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (strcasecmp(str, "IPv6") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else if (strcasecmp(str, "IB") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IB; + } else if (strcasecmp(str, "FC") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_FC; + } else { + return -ENOENT; + } + return 0; +} + +const char * +spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam) +{ + switch (adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + return "IPv4"; + case SPDK_NVMF_ADRFAM_IPV6: + return "IPv6"; + case SPDK_NVMF_ADRFAM_IB: + return "IB"; + case SPDK_NVMF_ADRFAM_FC: + return "FC"; + default: + return NULL; + } +} + +int +spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str) +{ + const char *sep, *sep1; + const char *whitespace = " \t\n"; + size_t key_len, val_len; + char key[32]; + char val[1024]; + + if (trid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + str += strspn(str, whitespace); + + sep = strchr(str, ':'); + if (!sep) { + sep = strchr(str, '='); + if (!sep) { + SPDK_ERRLOG("Key without ':' or '=' separator\n"); + return -EINVAL; + } + } else { + sep1 = strchr(str, '='); + if ((sep1 != NULL) && (sep1 < sep)) { + sep = sep1; + } + } + + key_len = sep - str; + if (key_len >= sizeof(key)) { + SPDK_ERRLOG("Transport key length %zu greater than maximum allowed %zu\n", + key_len, sizeof(key) - 1); + return -EINVAL; + } + + memcpy(key, str, key_len); + key[key_len] = '\0'; + + str += key_len + 1; /* Skip key: */ + val_len = strcspn(str, whitespace); + if (val_len == 0) { + SPDK_ERRLOG("Key without value\n"); + return -EINVAL; + } + + if (val_len >= sizeof(val)) { + SPDK_ERRLOG("Transport value length %zu greater than maximum allowed %zu\n", + val_len, sizeof(val) - 1); + return -EINVAL; + } + + memcpy(val, str, val_len); + val[val_len] = '\0'; + + str += val_len; + + if (strcasecmp(key, "trtype") == 0) { + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) { + SPDK_ERRLOG("Unknown trtype '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "adrfam") == 0) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) { + SPDK_ERRLOG("Unknown adrfam '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "traddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(trid->traddr, val, val_len + 1); + } else if (strcasecmp(key, "trsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(trid->trsvcid, val, val_len + 1); + } else if (strcasecmp(key, "subnqn") == 0) { + if (val_len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_NQN_MAX_LEN); + return -EINVAL; + } + memcpy(trid->subnqn, val, val_len + 1); + } else { + SPDK_ERRLOG("Unknown transport ID key 
'%s'\n", key); + } + } + + return 0; +} + +static int +cmp_int(int a, int b) +{ + return a - b; +} + +int +spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1, + const struct spdk_nvme_transport_id *trid2) +{ + int cmp; + + cmp = cmp_int(trid1->trtype, trid2->trtype); + if (cmp) { + return cmp; + } + + if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_pci_addr pci_addr1; + struct spdk_pci_addr pci_addr2; + + /* Normalize PCI addresses before comparing */ + if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 || + spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) { + return -1; + } + + /* PCIe transport ID only uses trtype and traddr */ + return spdk_pci_addr_compare(&pci_addr1, &pci_addr2); + } + + cmp = strcasecmp(trid1->traddr, trid2->traddr); + if (cmp) { + return cmp; + } + + cmp = cmp_int(trid1->adrfam, trid2->adrfam); + if (cmp) { + return cmp; + } + + cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid); + if (cmp) { + return cmp; + } + + cmp = strcmp(trid1->subnqn, trid2->subnqn); + if (cmp) { + return cmp; + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME) diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c new file mode 100644 index 00000000..69ae0878 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr.c @@ -0,0 +1,2678 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvme_internal.h" + +#include "spdk/env.h" +#include "spdk/string.h" + +static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer); +static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns); +static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns); + +static int +nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + &cc->raw); +} + +static int +nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw), + &csts->raw); +} + +int +nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap) +{ + return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw), + &cap->raw); +} + +int +nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw), + &vs->raw); +} + +static int +nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc->raw); +} + +void +spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + char host_id_str[SPDK_UUID_STRING_LEN]; + + assert(opts); + + memset(opts, 0, opts_size); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = DEFAULT_MAX_IO_QUEUES; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = true; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = SPDK_NVME_CC_AMS_RR; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = 10 * 1000; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE; + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS; + } + + if (FIELD_OK(host_id)) { + memset(opts->host_id, 0, sizeof(opts->host_id)); + } + + if (nvme_driver_init() == 0) { + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id, + sizeof(opts->extended_host_id)); + } + + if (FIELD_OK(hostnqn)) { + spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str), + &g_spdk_nvme_driver->default_extended_host_id); + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str); + } + } + + if (FIELD_OK(src_addr)) { + memset(opts->src_addr, 0, sizeof(opts->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memset(opts->src_svcid, 0, sizeof(opts->src_svcid)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = SPDK_NVME_CC_CSS_NVM; + } +#undef FIELD_OK +} + +/** + * This function will be called when the process allocates the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq); + qpair->active_proc = active_proc; + } +} + +/** + * This function will be called when the process frees the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_qpair *active_qpair, *tmp_qpair; + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (!active_proc) { + return; + } + + TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs, + per_process_tailq, tmp_qpair) { + if (active_qpair == qpair) { + TAILQ_REMOVE(&active_proc->allocated_io_qpairs, + active_qpair, per_process_tailq); + + break; + } + } +} + +void +spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_io_qpair_opts *opts, + size_t opts_size) +{ + assert(ctrlr); + + assert(opts); + + memset(opts, 0, opts_size); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(qprio)) { + opts->qprio = SPDK_NVME_QPRIO_URGENT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = ctrlr->opts.io_queue_size; + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = ctrlr->opts.io_queue_requests; + } + +#undef FIELD_OK +} + +struct spdk_nvme_qpair * +spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *user_opts, + size_t opts_size) +{ + uint32_t qid; + struct spdk_nvme_qpair *qpair; + union spdk_nvme_cc_register cc; + struct spdk_nvme_io_qpair_opts opts; + + if (!ctrlr) { + return NULL; + } + + /* + * Get the default options, then overwrite them with the user-provided options + * up to opts_size. + * + * This allows for extensions of the opts structure without breaking + * ABI compatibility. + */ + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc failed\n"); + return NULL; + } + + /* Only the low 2 bits (values 0, 1, 2, 3) of QPRIO are valid. */ + if ((opts.qprio & 3) != opts.qprio) { + return NULL; + } + + /* + * Only value SPDK_NVME_QPRIO_URGENT(0) is valid for the + * default round robin arbitration method. + */ + if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts.qprio != SPDK_NVME_QPRIO_URGENT)) { + SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n"); + return NULL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + /* + * Get the first available I/O queue ID. 
+ */ + qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); + if (qid > ctrlr->opts.num_io_queues) { + SPDK_ERRLOG("No free I/O queue IDs\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, &opts); + if (qpair == NULL) { + SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + spdk_bit_array_clear(ctrlr->free_io_qids, qid); + TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq); + + nvme_ctrlr_proc_add_io_qpair(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) { + spdk_delay_us(100); + } + + return qpair; +} + +int +spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + + if (qpair == NULL) { + return 0; + } + + ctrlr = qpair->ctrlr; + + if (qpair->in_completion_context) { + /* + * There are many cases where it is convenient to delete an io qpair in the context + * of that qpair's completion routine. To handle this properly, set a flag here + * so that the completion routine will perform an actual delete after the context + * unwinds. + */ + qpair->delete_after_completion_context = 1; + return 0; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_proc_remove_io_qpair(qpair); + + TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); + spdk_bit_array_set(ctrlr->free_io_qids, qpair->id); + + if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -1; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return 0; +} + +static void +nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_intel_log_page_directory *log_page_directory) +{ + if (log_page_directory == NULL) { + return; + } + + if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) { + return; + } + + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true; + + if (log_page_directory->read_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true; + } + if (log_page_directory->write_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true; + } + if (log_page_directory->temperature_statistics_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true; + } + if (log_page_directory->smart_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true; + } + if (log_page_directory->marketing_description_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true; + } +} + +static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint64_t phys_addr = 0; + struct nvme_completion_poll_status status; + struct spdk_nvme_intel_log_page_directory *log_page_directory; + + log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory), + 64, &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (log_page_directory == NULL) { + SPDK_ERRLOG("could not allocate log_page_directory\n"); + return -ENXIO; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY, + SPDK_NVME_GLOBAL_NS_TAG, log_page_directory, + sizeof(struct spdk_nvme_intel_log_page_directory), + 0, nvme_completion_poll_cb, &status); + if (rc != 0) { + 
spdk_free(log_page_directory); + return rc; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + spdk_free(log_page_directory); + SPDK_ERRLOG("nvme_ctrlr_cmd_get_log_page failed!\n"); + return -ENXIO; + } + + nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory); + spdk_free(log_page_directory); + return 0; +} + +static int +nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + + memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported)); + /* Mandatory pages */ + ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true; + if (ctrlr->cdata.lpa.celp) { + ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) { + rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr); + } + + return rc; +} + +static void +nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true; +} + +static void +nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported)); + /* Mandatory features */ + ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true; + /* Optional features */ + if (ctrlr->cdata.vwc.present) { + ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true; + } + if (ctrlr->cdata.apsta.supported) { + ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true; + } + if (ctrlr->cdata.hmpre) { + ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) { + nvme_ctrlr_set_intel_supported_features(ctrlr); + } +} + +void +nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove) +{ + /* + * Set the flag here and leave the work failure of qpairs to + * spdk_nvme_qpair_process_completions(). 
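+ * (i.e. each qpair is failed lazily the next time its completions are processed)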
+ */ + if (hot_remove) { + ctrlr->is_removed = true; + } + ctrlr->is_failed = true; + SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr); +} + +static void +nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ms_waited = 0; + uint32_t shutdown_timeout_ms; + + if (ctrlr->is_removed) { + return; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return; + } + + cc.bits.shn = SPDK_NVME_SHN_NORMAL; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return; + } + + /* + * The NVMe specification defines RTD3E to be the time between + * setting SHN = 1 until the controller will set SHST = 10b. + * If the device doesn't report RTD3 entry latency, or if it + * reports RTD3 entry latency less than 10 seconds, pick + * 10 seconds as a reasonable amount of time to + * wait before proceeding. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e); + shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000; + shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms); + + do { + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + SPDK_ERRLOG("get_csts() failed\n"); + return; + } + + if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown complete in %u milliseconds\n", + ms_waited); + return; + } + + nvme_delay(1000); + ms_waited++; + } while (ms_waited < shutdown_timeout_ms); + + SPDK_ERRLOG("did not shutdown within %u milliseconds\n", shutdown_timeout_ms); +} + +static int +nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + int rc; + + rc = nvme_transport_ctrlr_enable(ctrlr); + if (rc != 0) { + SPDK_ERRLOG("transport ctrlr_enable failed\n"); + return rc; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en != 0) { + SPDK_ERRLOG("%s called with CC.EN = 1\n", __func__); + return -EINVAL; + } + + cc.bits.en = 1; + cc.bits.css = 0; + cc.bits.shn = 0; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + + /* Page size is 2 ^ (12 + mps). */ + cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12; + + if (ctrlr->cap.bits.css == 0) { + SPDK_INFOLOG(SPDK_LOG_NVME, + "Drive reports no command sets supported. 
Assuming NVM is supported.\n"); + ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + } + + if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n", + ctrlr->opts.command_set, ctrlr->cap.bits.css); + return -EINVAL; + } + + cc.bits.css = ctrlr->opts.command_set; + + switch (ctrlr->opts.arb_mechanism) { + case SPDK_NVME_CC_AMS_RR: + break; + case SPDK_NVME_CC_AMS_WRR: + if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + case SPDK_NVME_CC_AMS_VS: + if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + default: + return -EINVAL; + } + + cc.bits.ams = ctrlr->opts.arb_mechanism; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +#ifdef DEBUG +static const char * +nvme_ctrlr_state_string(enum nvme_ctrlr_state state) +{ + switch (state) { + case NVME_CTRLR_STATE_INIT_DELAY: + return "delay init"; + case NVME_CTRLR_STATE_INIT: + return "init"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + return "disable and wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + return "disable and wait for CSTS.RDY = 0"; + case NVME_CTRLR_STATE_ENABLE: + return "enable controller by writing CC.EN = 1"; + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + return "wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE: + return "enable admin queue"; + case NVME_CTRLR_STATE_IDENTIFY: + return "identify controller"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + return "wait for identify controller"; + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + return "set number of queues"; + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + return "wait for set number of queues"; + case NVME_CTRLR_STATE_GET_NUM_QUEUES: + return "get number of queues"; + case NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES: + return "wait for get number of queues"; + case NVME_CTRLR_STATE_CONSTRUCT_NS: + return "construct namespaces"; + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + return "identify active ns"; + case NVME_CTRLR_STATE_IDENTIFY_NS: + return "identify ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + return "wait for identify ns"; + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + return "identify namespace id descriptors"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + return "wait for identify namespace id descriptors"; + case NVME_CTRLR_STATE_CONFIGURE_AER: + return "configure AER"; + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + return "wait for configure aer"; + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + return "set supported log pages"; + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + return "set supported features"; + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + return "set doorbell buffer config"; + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + return "wait for doorbell buffer config"; + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + return "set keep alive timeout"; + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + return "wait for set keep alive timeout"; + case NVME_CTRLR_STATE_SET_HOST_ID: + return "set host ID"; + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + return "wait for set host ID"; + case NVME_CTRLR_STATE_READY: + return "ready"; + case NVME_CTRLR_STATE_ERROR: + return "error"; + } + return "unknown"; +}; +#endif /* DEBUG */ + +static void +nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum nvme_ctrlr_state state, + uint64_t timeout_in_ms) +{ + ctrlr->state = 
state; + if (timeout_in_ms == NVME_TIMEOUT_INFINITE) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n", + nvme_ctrlr_state_string(ctrlr->state)); + ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n", + nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms); + ctrlr->state_timeout_tsc = spdk_get_ticks() + (timeout_in_ms * spdk_get_ticks_hz()) / 1000; + } +} + +static void +nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->shadow_doorbell) { + spdk_dma_free(ctrlr->shadow_doorbell); + ctrlr->shadow_doorbell = NULL; + } + + if (ctrlr->eventidx) { + spdk_dma_free(ctrlr->eventidx); + ctrlr->eventidx = NULL; + } +} + +static void +nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Doorbell buffer config failed\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n", + ctrlr->trid.traddr); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint64_t prp1, prp2; + + if (!ctrlr->cdata.oacs.doorbell_buffer_config) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE); + return 0; + } + + /* only 1 page size for doorbell buffer */ + ctrlr->shadow_doorbell = spdk_dma_zmalloc(ctrlr->page_size, ctrlr->page_size, + &prp1); + if (ctrlr->shadow_doorbell == NULL) { + rc = -ENOMEM; + goto error; + } + + ctrlr->eventidx = spdk_dma_zmalloc(ctrlr->page_size, ctrlr->page_size, &prp2); + if (ctrlr->eventidx == NULL) { + rc = -ENOMEM; + goto error; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, NVME_TIMEOUT_INFINITE); + + rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2, + nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr); + if (rc != 0) { + goto error; + } + + return 0; + +error: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + nvme_ctrlr_free_doorbell_buffer(ctrlr); + return rc; +} + +int +spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct spdk_nvme_qpair *qpair; + struct nvme_request *req, *tmp; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_resetting || ctrlr->is_failed) { + /* + * Controller is already resetting or has failed. Return + * immediately since there is no need to kick off another + * reset in these cases. + */ + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return 0; + } + + ctrlr->is_resetting = true; + + SPDK_NOTICELOG("resetting controller\n"); + + /* Free all of the queued abort requests */ + STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + nvme_free_request(req); + ctrlr->outstanding_aborts--; + } + + /* Disable all queues before disabling the controller hardware. 
*/ + nvme_qpair_disable(ctrlr->adminq); + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + nvme_qpair_disable(qpair); + } + + /* Doorbell buffer config is invalid during reset */ + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + /* Set the state back to INIT to cause a full hardware reset. */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + + while (ctrlr->state != NVME_CTRLR_STATE_READY) { + if (nvme_ctrlr_process_init(ctrlr) != 0) { + SPDK_ERRLOG("%s: controller reinitialization failed\n", __func__); + nvme_ctrlr_fail(ctrlr, false); + rc = -1; + break; + } + } + + if (!ctrlr->is_failed) { + /* Reinitialize qpairs */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + if (nvme_transport_ctrlr_reinit_io_qpair(ctrlr, qpair) != 0) { + nvme_ctrlr_fail(ctrlr, false); + rc = -1; + } + } + } + + ctrlr->is_resetting = false; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +static void +nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* + * Use MDTS to ensure our default max_xfer_size doesn't exceed what the + * controller supports. + */ + ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size); + if (ctrlr->cdata.mdts > 0) { + ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size, + ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid); + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + ctrlr->cntlid = ctrlr->cdata.cntlid; + } else { + /* + * Fabrics controllers should already have CNTLID from the Connect command. + * + * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data, + * trust the one from Connect. 
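+ * (The check below only logs the mismatch at debug level; ctrlr->cntlid keeps the value from the Connect command.)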
+ */ + if (ctrlr->cntlid != ctrlr->cdata.cntlid) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n", + ctrlr->cdata.cntlid, ctrlr->cntlid); + } + } + + if (ctrlr->cdata.sgls.supported) { + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED; + ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, NVME_TIMEOUT_INFINITE); + + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &ctrlr->cdata, sizeof(ctrlr->cdata), + nvme_ctrlr_identify_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +int +nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_completion_poll_status status; + int rc; + uint32_t i; + uint32_t num_pages; + uint32_t next_nsid = 0; + uint32_t *new_ns_list = NULL; + + + /* + * The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list) + */ + num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1; + new_ns_list = spdk_dma_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size, + NULL); + if (!new_ns_list) { + SPDK_ERRLOG("Failed to allocate active_ns_list!\n"); + return -ENOMEM; + } + + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 1, 0) && !(ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + /* + * Iterate through the pages and fetch each chunk of 1024 namespaces until + * there are no more active namespaces + */ + for (i = 0; i < num_pages; i++) { + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, next_nsid, + &new_ns_list[1024 * i], sizeof(struct spdk_nvme_ns_list), + nvme_completion_poll_cb, &status); + if (rc != 0) { + goto fail; + } + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + SPDK_ERRLOG("nvme_ctrlr_cmd_identify_active_ns_list failed!\n"); + rc = -ENXIO; + goto fail; + } + next_nsid = new_ns_list[1024 * i + 1023]; + if (next_nsid == 0) { + /* + * No more active namespaces found, no need to fetch additional chunks + */ + break; + } + } + + } else { + /* + * Controller doesn't support active ns list CNS 0x02 so dummy up + * an active ns list + */ + for (i = 0; i < ctrlr->num_ns; i++) { + new_ns_list[i] = i + 1; + } + } + + /* + * Now that that the list is properly setup, we can swap it in to the ctrlr and + * free up the previous one. 
+ */ + spdk_dma_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = new_ns_list; + + return 0; +fail: + spdk_dma_free(new_ns_list); + return rc; +} + +static void +nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } else { + nvme_ns_set_identify_data(ns); + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, NVME_TIMEOUT_INFINITE); + return; + } + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + struct spdk_nvme_ns_data *nsdata; + + nsdata = &ctrlr->nsdata[ns->id - 1]; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, NVME_TIMEOUT_INFINITE); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_ctrlr_identify_ns_async_done, ns); +} + +static int +nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + return 0; + } + + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + return; + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + return; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, NVME_TIMEOUT_INFINITE); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, + 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_ctrlr_identify_id_desc_async_done, ns); +} + +static int +nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + 
SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + return 0; + } + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + return 0; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Set Features - Number of Queues failed!\n"); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_GET_NUM_QUEUES, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) { + SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n", + ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES); + ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES; + } else if (ctrlr->opts.num_io_queues < 1) { + SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n"); + ctrlr->opts.num_io_queues = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, NVME_TIMEOUT_INFINITE); + + rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues, + nvme_ctrlr_set_num_queues_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_get_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t cq_allocated, sq_allocated, min_allocated, i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Get Features - Number of Queues failed!\n"); + ctrlr->opts.num_io_queues = 0; + } else { + /* + * Data in cdw0 is 0-based. + * Lower 16-bits indicate number of submission queues allocated. + * Upper 16-bits indicate number of completion queues allocated. + */ + sq_allocated = (cpl->cdw0 & 0xFFFF) + 1; + cq_allocated = (cpl->cdw0 >> 16) + 1; + + /* + * For 1:1 queue mapping, set number of allocated queues to be minimum of + * submission and completion queues. + */ + min_allocated = spdk_min(sq_allocated, cq_allocated); + + /* Set number of queues to be minimum of requested and actually allocated. */ + ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues); + } + + ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1); + if (ctrlr->free_io_qids == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */ + spdk_bit_array_clear(ctrlr->free_io_qids, 0); + for (i = 1; i <= ctrlr->opts.num_io_queues; i++) { + spdk_bit_array_set(ctrlr->free_io_qids, i); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_get_num_queues(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES, NVME_TIMEOUT_INFINITE); + + /* Obtain the number of queues allocated using Get Features. 
*/ + rc = nvme_ctrlr_cmd_get_num_queues(ctrlr, nvme_ctrlr_get_num_queues_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t keep_alive_interval_ms; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n", + cpl->status.sc, cpl->status.sct); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n", + cpl->cdw0); + } + + ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0; + + keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2; + if (keep_alive_interval_ms == 0) { + keep_alive_interval_ms = 1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Sending keep alive every %u ms\n", keep_alive_interval_ms); + + ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000); + + /* Schedule the first Keep Alive to be sent as soon as possible. */ + ctrlr->next_keep_alive_tick = spdk_get_ticks(); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.keep_alive_timeout_ms == 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->cdata.kas == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n"); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE); + return 0; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE); + + /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */ + rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0, + nvme_ctrlr_set_keep_alive_timeout_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + /* + * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature + * is optional. + */ + SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n", + cpl->status.sc, cpl->status.sct); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr) +{ + uint8_t *host_id; + uint32_t host_id_size; + int rc; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + /* + * NVMe-oF sends the host ID during Connect and doesn't allow + * Set Features - Host Identifier after Connect, so we don't need to do anything here. 
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->cdata.ctratt.host_id_exhid_supported) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n"); + host_id = ctrlr->opts.extended_host_id; + host_id_size = sizeof(ctrlr->opts.extended_host_id); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n"); + host_id = ctrlr->opts.host_id; + host_id_size = sizeof(ctrlr->opts.host_id); + } + + /* If the user specified an all-zeroes host identifier, don't send the command. */ + if (spdk_mem_all_zero(host_id, host_id_size)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "User did not specify host ID - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + SPDK_TRACEDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, NVME_TIMEOUT_INFINITE); + + rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->ns) { + uint32_t i, num_ns = ctrlr->num_ns; + + for (i = 0; i < num_ns; i++) { + nvme_ns_destruct(&ctrlr->ns[i]); + } + + spdk_free(ctrlr->ns); + ctrlr->ns = NULL; + ctrlr->num_ns = 0; + } + + if (ctrlr->nsdata) { + spdk_free(ctrlr->nsdata); + ctrlr->nsdata = NULL; + } + + spdk_dma_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = NULL; +} + +static void +nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i, nn = ctrlr->cdata.nn; + struct spdk_nvme_ns_data *nsdata; + + for (i = 0; i < nn; i++) { + struct spdk_nvme_ns *ns = &ctrlr->ns[i]; + uint32_t nsid = i + 1; + nsdata = &ctrlr->nsdata[nsid - 1]; + + if ((nsdata->ncap == 0) && spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) { + if (nvme_ns_construct(ns, nsid, ctrlr) != 0) { + continue; + } + } + + if (nsdata->ncap && !spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) { + nvme_ns_destruct(ns); + } + } +} + +static int +nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint32_t nn = ctrlr->cdata.nn; + uint64_t phys_addr = 0; + + /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset), + * so check if we need to reallocate. 
+ */ + if (nn != ctrlr->num_ns) { + nvme_ctrlr_destruct_namespaces(ctrlr); + + if (nn == 0) { + SPDK_WARNLOG("controller has 0 namespaces\n"); + return 0; + } + + ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64, + &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr->ns == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64, + &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA); + if (ctrlr->nsdata == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->num_ns = nn; + } + + return 0; + +fail: + nvme_ctrlr_destruct_namespaces(ctrlr); + return rc; +} + +static void +nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer = arg; + struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + union spdk_nvme_async_event_completion event; + int rc; + + if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && + cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) { + /* + * This is simulated when controller is being shut down, to + * effectively abort outstanding asynchronous event requests + * and make sure all memory is freed. Do not repost the + * request in this case. + */ + return; + } + + if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) { + /* + * SPDK will only send as many AERs as the device says it supports, + * so this status code indicates an out-of-spec device. Do not repost + * the request in this case. + */ + SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request\n" + "handling. Do not repost this AER.\n"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + rc = nvme_ctrlr_identify_active_ns(ctrlr); + if (rc) { + return; + } + nvme_ctrlr_update_namespaces(ctrlr); + } + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc && active_proc->aer_cb_fn) { + active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl); + } + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ + if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) { + /* + * We can't do anything to recover from a failure here, + * so just print a warning message and leave the AER unsubmitted. + */ + SPDK_ERRLOG("resubmitting AER failed!\n"); + } +} + +static int +nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer) +{ + struct nvme_request *req; + + aer->ctrlr = ctrlr; + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer); + aer->req = req; + if (req == NULL) { + return -1; + } + + req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static void +nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer; + int rc; + uint32_t i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, NVME_TIMEOUT_INFINITE); + return; + } + + /* aerl is a zero-based value, so we need to add 1 here. 
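+ * For example, aerl == 0 means the controller supports a single outstanding
+ * Asynchronous Event Request and aerl == 3 means four; the result is further
+ * capped at NVME_MAX_ASYNC_EVENTS by the spdk_min() below.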
*/ + ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1)); + + for (i = 0; i < ctrlr->num_aers; i++) { + aer = &ctrlr->aer[i]; + rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); + if (rc) { + SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_feat_async_event_configuration config; + int rc; + + config.raw = 0; + config.bits.crit_warn.bits.available_spare = 1; + config.bits.crit_warn.bits.temperature = 1; + config.bits.crit_warn.bits.device_reliability = 1; + config.bits.crit_warn.bits.read_only = 1; + config.bits.crit_warn.bits.volatile_memory_backup = 1; + + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) { + if (ctrlr->cdata.oaes.ns_attribute_notices) { + config.bits.ns_attr_notice = 1; + } + if (ctrlr->cdata.oaes.fw_activation_notices) { + config.bits.fw_activation_notice = 1; + } + } + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) { + config.bits.telemetry_log_notice = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + + rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config, + nvme_ctrlr_configure_aer_done, + ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +struct spdk_nvme_ctrlr_process * +spdk_nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + if (active_proc->pid == pid) { + return active_proc; + } + } + + return NULL; +} + +struct spdk_nvme_ctrlr_process * +spdk_nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr) +{ + return spdk_nvme_ctrlr_get_process(ctrlr, getpid()); +} + +/** + * This function will be called when a process is using the controller. + * 1. For the primary process, it is called when constructing the controller. + * 2. For the secondary process, it is called at probing the controller. + * Note: will check whether the process is already added for the same process. + */ +int +nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle) +{ + struct spdk_nvme_ctrlr_process *ctrlr_proc; + pid_t pid = getpid(); + + /* Check whether the process is already added or not */ + if (spdk_nvme_ctrlr_get_process(ctrlr, pid)) { + return 0; + } + + /* Initialize the per process properties for this ctrlr */ + ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr_proc == NULL) { + SPDK_ERRLOG("failed to allocate memory to track the process props\n"); + + return -1; + } + + ctrlr_proc->is_primary = spdk_process_is_primary(); + ctrlr_proc->pid = pid; + STAILQ_INIT(&ctrlr_proc->active_reqs); + ctrlr_proc->devhandle = devhandle; + ctrlr_proc->ref = 0; + TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs); + + TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq); + + return 0; +} + +/** + * This function will be called when the process detaches the controller. + * Note: the ctrlr_lock must be held when calling this function. 
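+ * A hypothetical caller therefore looks like:
+ *
+ *   nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ *   nvme_ctrlr_remove_process(ctrlr, proc);
+ *   nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ *
+ * Any IO qpairs still owned by the departing process are freed here before
+ * the process entry itself is released.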
+ */ +static void +nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_process *proc) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + assert(STAILQ_EMPTY(&proc->active_reqs)); + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq); + + spdk_dma_free(proc); +} + +/** + * This function will be called when the process exited unexpectedly + * in order to free any incomplete nvme request, allocated IO qpairs + * and allocated memory. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc) +{ + struct nvme_request *req, *tmp_req; + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == proc->pid); + + nvme_free_request(req); + } + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq); + + /* + * The process may have been killed while some qpairs were in their + * completion context. Clear that flag here to allow these IO + * qpairs to be deleted. + */ + qpair->in_completion_context = 0; + + qpair->no_deletion_notification_needed = 1; + + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + spdk_dma_free(proc); +} + +/** + * This function will be called when destructing the controller. + * 1. There is no more admin request on this controller. + * 2. Clean up any left resource allocation when its associated process is gone. + */ +void +nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + + /* Free all the processes' properties and make sure no pending admin IOs */ + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + assert(STAILQ_EMPTY(&active_proc->active_reqs)); + + spdk_free(active_proc); + } +} + +/** + * This function will be called when any other process attaches or + * detaches the controller in order to cleanup those unexpectedly + * terminated processes. + * Note: the ctrlr_lock must be held when calling this function. 
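+ * Liveness is probed with kill(pid, 0): an ESRCH error means the pid no
+ * longer exists, so that process's leftover resources are reclaimed via
+ * nvme_ctrlr_cleanup_process(); live processes are simply counted and the
+ * count is returned to the caller.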
+ */ +static int +nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + int active_proc_count = 0; + + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) { + SPDK_ERRLOG("process %d terminated unexpected\n", active_proc->pid); + + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + nvme_ctrlr_cleanup_process(active_proc); + } else { + active_proc_count++; + } + } + + return active_proc_count; +} + +void +nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref++; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int proc_count; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref--; + assert(active_proc->ref >= 0); + + /* + * The last active process will be removed at the end of + * the destruction of the controller. + */ + if (active_proc->ref == 0 && proc_count != 1) { + nvme_ctrlr_remove_process(ctrlr, active_proc); + } + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int ref = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + ref += active_proc->ref; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return ref; +} + +/** + * Get the PCI device handle which is only visible to its associated process. + */ +struct spdk_pci_device * +nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_pci_device *devhandle = NULL; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + devhandle = active_proc->devhandle; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return devhandle; +} + +static void +nvme_ctrlr_enable_admin_queue(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_transport_qpair_reset(ctrlr->adminq); + nvme_qpair_enable(ctrlr->adminq); +} + +/** + * This function will be called repeatedly during initialization until the controller is ready. + */ +int +nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ready_timeout_in_ms; + int rc = 0; + + /* + * May need to avoid accessing any register on the target controller + * for a while. Return early without touching the FSM. + * Check sleep_timeout_tsc > 0 for unit test. + */ + if ((ctrlr->sleep_timeout_tsc > 0) && + (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) { + return 0; + } + ctrlr->sleep_timeout_tsc = 0; + + if (nvme_ctrlr_get_cc(ctrlr, &cc) || + nvme_ctrlr_get_csts(ctrlr, &csts)) { + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { + /* While a device is resetting, it may be unable to service MMIO reads + * temporarily. Allow for this case. 
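+ * Treat the failed register read as transient here and jump to the
+ * init_timeout check below; the controller is only failed if the state
+ * timeout (state_timeout_tsc) eventually expires.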
+ */ + SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n"); + goto init_timeout; + } + SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state); + nvme_ctrlr_fail(ctrlr, false); + return -EIO; + } + + ready_timeout_in_ms = 500 * ctrlr->cap.bits.to; + + /* + * Check if the current initialization step is done or has timed out. + */ + switch (ctrlr->state) { + case NVME_CTRLR_STATE_INIT_DELAY: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms); + /* + * Controller may need some delay before it's enabled. + * + * This is a workaround for an issue where the PCIe-attached NVMe controller + * is not ready after VFIO reset. We delay the initialization rather than the + * enabling itself, because this is required only for the very first enabling + * - directly after a VFIO reset. + * + * TODO: Figure out what is actually going wrong. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000); + break; + + case NVME_CTRLR_STATE_INIT: + /* Begin the hardware initialization by making sure the controller is disabled. */ + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n"); + /* + * Controller is currently enabled. We need to disable it to cause a reset. + * + * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready. + * Wait for the ready bit to be 1 before disabling the controller. + */ + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return 0; + } + + /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + nvme_ctrlr_fail(ctrlr, false); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + + /* + * Wait 2.5 seconds before accessing PCI registers. + * Not using sleep() to avoid blocking other controller's initialization. + */ + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000); + } + return 0; + } else { + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n"); + /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. 
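+ * The overall handshake driven by this state machine is roughly:
+ * EN=1/RDY=1 -> write EN=0 -> wait for RDY=0 -> write EN=1 -> wait for
+ * RDY=1, after which the admin queue is enabled and Identify begins.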
*/ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + nvme_ctrlr_fail(ctrlr, false); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms); + /* + * Delay 100us before setting CC.EN = 1. Some NVMe SSDs miss CC.EN getting + * set to 1 if it is too soon after CSTS.RDY is reported as 0. + */ + spdk_delay_us(100); + return 0; + } + break; + + case NVME_CTRLR_STATE_ENABLE: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n"); + rc = nvme_ctrlr_enable(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return rc; + + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n"); + /* + * The controller has been enabled. + * Perform the rest of initialization serially. + */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE, NVME_TIMEOUT_INFINITE); + return 0; + } + break; + + case NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE: + nvme_ctrlr_enable_admin_queue(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_IDENTIFY: + rc = nvme_ctrlr_identify(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + rc = nvme_ctrlr_set_num_queues(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_GET_NUM_QUEUES: + rc = nvme_ctrlr_get_num_queues(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONSTRUCT_NS: + rc = nvme_ctrlr_construct_namespaces(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + rc = nvme_ctrlr_identify_active_ns(ctrlr); + if (rc < 0) { + nvme_ctrlr_destruct_namespaces(ctrlr); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_IDENTIFY_NS: + rc = nvme_ctrlr_identify_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONFIGURE_AER: + rc = nvme_ctrlr_configure_aer(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + rc = nvme_ctrlr_set_supported_log_pages(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + 
nvme_ctrlr_set_supported_features(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG, NVME_TIMEOUT_INFINITE); + break; + + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_HOST_ID: + rc = nvme_ctrlr_set_host_id(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_READY: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n"); + return 0; + + case NVME_CTRLR_STATE_ERROR: + SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr); + return -1; + + default: + assert(0); + nvme_ctrlr_fail(ctrlr, false); + return -1; + } + +init_timeout: + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE && + spdk_get_ticks() > ctrlr->state_timeout_tsc) { + SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state); + nvme_ctrlr_fail(ctrlr, false); + return -1; + } + + return rc; +} + +int +nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx) +{ + pthread_mutexattr_t attr; + int rc = 0; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) || +#ifndef __FreeBSD__ + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || +#endif + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); + return rc; +} + +int +nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE); + } else { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + } + + ctrlr->flags = 0; + ctrlr->free_io_qids = NULL; + ctrlr->is_resetting = false; + ctrlr->is_failed = false; + + TAILQ_INIT(&ctrlr->active_io_qpairs); + STAILQ_INIT(&ctrlr->queued_aborts); + ctrlr->outstanding_aborts = 0; + + rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock); + if (rc != 0) { + return rc; + } + + TAILQ_INIT(&ctrlr->active_procs); + + return rc; +} + +/* This function should be called once at ctrlr initialization to set up constant properties. */ +void +nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs) +{ + ctrlr->cap = *cap; + ctrlr->vs = *vs; + + ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin); + + /* For now, always select page_size == min_page_size. 
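+ * CAP.MPSMIN is an exponent relative to 4 KiB, so min_page_size above is
+ * 1u << (12 + mpsmin); for example mpsmin == 0 yields 4096 bytes.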
*/ + ctrlr->page_size = ctrlr->min_page_size; + + ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u); + + ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size); +} + +void +nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr) +{ + pthread_mutex_destroy(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_qpair *qpair, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr); + TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + nvme_ctrlr_shutdown(ctrlr); + + nvme_ctrlr_destruct_namespaces(ctrlr); + + spdk_bit_array_free(&ctrlr->free_io_qids); + + nvme_transport_ctrlr_destruct(ctrlr); +} + +int +nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + return nvme_qpair_submit_request(ctrlr->adminq, req); +} + +static void +nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl) +{ + /* Do nothing */ +} + +/* + * Check if we need to send a Keep Alive command. + * Caller must hold ctrlr->ctrlr_lock. + */ +static void +nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) +{ + uint64_t now; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + now = spdk_get_ticks(); + if (now < ctrlr->next_keep_alive_tick) { + return; + } + + req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL); + if (req == NULL) { + return; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + if (rc != 0) { + SPDK_ERRLOG("Submitting Keep Alive failed\n"); + } + + ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks; +} + +int32_t +spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr) +{ + int32_t num_completions; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + if (ctrlr->keep_alive_interval_ticks) { + nvme_ctrlr_keep_alive(ctrlr); + } + num_completions = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return num_completions; +} + +const struct spdk_nvme_ctrlr_data * +spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr) +{ + return &ctrlr->cdata; +} + +union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_csts_register csts; + + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + csts.raw = 0xFFFFFFFFu; + } + return csts; +} + +union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->cap; +} + +union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->vs; +} + +uint32_t +spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->num_ns; +} + +static int32_t +spdk_nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int32_t result = -1; + + if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) { + return result; + } + + int32_t lower = 0; + int32_t upper = ctrlr->num_ns - 1; + int32_t mid; + + while (lower <= upper) { + mid = lower + (upper - lower) / 2; + if (ctrlr->active_ns_list[mid] == nsid) 
{ + result = mid; + break; + } else { + if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) { + lower = mid + 1; + } else { + upper = mid - 1; + } + + } + } + + return result; +} + +bool +spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + return spdk_nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1; +} + +uint32_t +spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->active_ns_list ? ctrlr->active_ns_list[0] : 0; +} + +uint32_t +spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid) +{ + int32_t nsid_idx = spdk_nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid); + if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) { + return ctrlr->active_ns_list[nsid_idx + 1]; + } + return 0; +} + +struct spdk_nvme_ns * +spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + if (nsid < 1 || nsid > ctrlr->num_ns) { + return NULL; + } + + return &ctrlr->ns[nsid - 1]; +} + +struct spdk_pci_device * +spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return NULL; + } + + return nvme_ctrlr_proc_get_devhandle(ctrlr); +} + +uint32_t +spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->max_xfer_size; +} + +void +spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_aer_cb aer_cb_fn, + void *aer_cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->aer_cb_fn = aer_cb_fn; + active_proc->aer_cb_arg = aer_cb_arg; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr, + uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL; + active_proc->timeout_cb_fn = cb_fn; + active_proc->timeout_cb_arg = cb_arg; + } + + ctrlr->timeout_enabled = true; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +bool +spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page) +{ + /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch"); + return ctrlr->log_page_supported[log_page]; +} + +bool +spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code) +{ + /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch"); + return ctrlr->feature_supported[feature_code]; +} + +int +spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status status; + int res; + struct spdk_nvme_ns *ns; + + res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, &status); + if (res) { + return res; + } + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) 
{ + SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n"); + return -ENXIO; + } + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + return nvme_ns_construct(ns, nsid, ctrlr); +} + +int +spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status status; + int res; + struct spdk_nvme_ns *ns; + + res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, &status); + if (res) { + return res; + } + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n"); + return -ENXIO; + } + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + /* Inactive NS */ + nvme_ns_destruct(ns); + + return 0; +} + +uint32_t +spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload) +{ + struct nvme_completion_poll_status status; + int res; + uint32_t nsid; + struct spdk_nvme_ns *ns; + + res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, &status); + if (res) { + return 0; + } + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n"); + return 0; + } + + nsid = status.cpl.cdw0; + ns = &ctrlr->ns[nsid - 1]; + /* Inactive NS */ + res = nvme_ns_construct(ns, nsid, ctrlr); + if (res) { + return 0; + } + + /* Return the namespace ID that was created */ + return nsid; +} + +int +spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct nvme_completion_poll_status status; + int res; + struct spdk_nvme_ns *ns; + + res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, &status); + if (res) { + return res; + } + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n"); + return -ENXIO; + } + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + nvme_ns_destruct(ns); + + return 0; +} + +int +spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format) +{ + struct nvme_completion_poll_status status; + int res; + + res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb, + &status); + if (res) { + return res; + } + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n"); + return -ENXIO; + } + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +int +spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size, + int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status) +{ + struct spdk_nvme_fw_commit fw_commit; + struct nvme_completion_poll_status status; + int res; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + void *p; + + if (!completion_status) { + return -EINVAL; + } + memset(completion_status, 0, sizeof(struct spdk_nvme_status)); + if (size % 4) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n"); + return -1; + } + + /* Current support only for SPDK_NVME_FW_COMMIT_REPLACE_IMG + * and SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG + */ + if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) && + (commit_action != 
SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n"); + return -1; + } + + /* Firmware download */ + size_remaining = size; + offset = 0; + p = payload; + + while (size_remaining > 0) { + transfer = spdk_min(size_remaining, ctrlr->min_page_size); + + res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p, + nvme_completion_poll_cb, + &status); + if (res) { + return res; + } + + if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n"); + return -ENXIO; + } + p += transfer; + offset += transfer; + size_remaining -= transfer; + } + + /* Firmware commit */ + memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); + fw_commit.fs = slot; + fw_commit.ca = commit_action; + + res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb, + &status); + if (res) { + return res; + } + + res = spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock); + + memcpy(completion_status, &status.cpl.status, sizeof(struct spdk_nvme_status)); + + if (res) { + if (status.cpl.status.sct != SPDK_NVME_SCT_COMMAND_SPECIFIC || + status.cpl.status.sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) { + if (status.cpl.status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + status.cpl.status.sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) { + SPDK_NOTICELOG("firmware activation requires conventional reset to be performed. !\n"); + } else { + SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n"); + } + return -ENXIO; + } + } + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +void * +spdk_nvme_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) +{ + void *buf; + + if (size == 0) { + return NULL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + buf = nvme_transport_ctrlr_alloc_cmb_io_buffer(ctrlr, size); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return buf; +} + +void +spdk_nvme_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) +{ + if (buf && size) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_free_cmb_io_buffer(ctrlr, buf, size); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c new file mode 100644 index 00000000..750a2d78 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c @@ -0,0 +1,694 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +int +spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, void *md_buf, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buf, md_buf); + + req = nvme_allocate_request(qpair, &payload, len, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_IDENTIFY; + cmd->cdw10 = cns | ((uint32_t)cntid << 16); + cmd->nsid = nsid; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +int +nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10 = SPDK_NVME_NS_CTRLR_ATTACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct 
spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10 = SPDK_NVME_NS_CTRLR_DETACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ns_data), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10 = SPDK_NVME_NS_MANAGEMENT_CREATE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10 = SPDK_NVME_NS_MANAGEMENT_DELETE; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG; + cmd->dptr.prp.prp1 = prp1; + cmd->dptr.prp.prp2 = prp2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FORMAT_NVM; + cmd->nsid = nsid; + memcpy(&cmd->cdw10, format, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct 
nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10 = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10 = feature; + cmd->cdw11 = cdw11; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10 = feature; + cmd->cdw11 = cdw11; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10 = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t cdw11; + + cdw11 = ((num_queues - 1) << 16) | (num_queues - 1); + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, cdw11, 0, + NULL, 0, cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn, 
+ void *cb_arg) +{ + uint32_t cdw11; + + cdw11 = config.raw; + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0, + NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t cdw11; + + if (host_id_size == 16) { + /* 128-bit extended host identifier */ + cdw11 = 1; + } else if (host_id_size == 8) { + /* 64-bit host identifier */ + cdw11 = 0; + } else { + SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size); + return -EINVAL; + } + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER, cdw11, 0, + host_id, host_id_size, cb_fn, cb_arg); +} + +int +spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint32_t numd, numdl, numdu; + uint32_t lpol, lpou; + int rc; + + if (payload_size == 0) { + return -EINVAL; + } + + if (offset & 3) { + return -EINVAL; + } + + numd = payload_size / sizeof(uint32_t) - 1u; + numdl = numd & 0xFFFFu; + numdu = (numd >> 16) & 0xFFFFu; + + lpol = (uint32_t)offset; + lpou = (uint32_t)(offset >> 32); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (offset && !ctrlr->cdata.lpa.edlp) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE; + cmd->nsid = nsid; + cmd->cdw10 = numdl << 16; + cmd->cdw10 |= log_page; + cmd->cdw11 = numdu; + cmd->cdw12 = lpol; + cmd->cdw13 = lpou; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +static void +spdk_nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req, *next, *tmp; + struct spdk_nvme_ctrlr *ctrlr; + int rc; + + req = ctx; + ctrlr = (struct spdk_nvme_ctrlr *)req->user_buffer; + + ctrlr->outstanding_aborts--; + STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + ctrlr->outstanding_aborts++; + rc = nvme_ctrlr_submit_admin_request(ctrlr, next); + if (rc < 0) { + SPDK_ERRLOG("Failed to submit queued abort.\n"); + memset(&next->cpl, 0, sizeof(next->cpl)); + next->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + next->cpl.status.dnr = 1; + nvme_complete_request(next, &req->cpl); + nvme_free_request(next); + } else { + /* If the first abort succeeds, stop iterating. 
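+ * At most one queued abort is resubmitted per completion; the remainder
+ * stay on ctrlr->queued_aborts and are drained as later aborts complete,
+ * which keeps outstanding_aborts within the limit derived from cdata.acl.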
*/ + break; + } + } + + req->user_cb_fn(req->user_cb_arg, cpl); +} + +int +spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint16_t sqid; + + if (qpair) { + sqid = qpair->id; + } else { + sqid = ctrlr->adminq->id; /* 0 */ + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, spdk_nvme_ctrlr_cmd_abort_cpl, NULL); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + req->cb_arg = req; + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + req->user_buffer = ctrlr; /* This is a hack to get to the ctrlr in the + * completion handler. */ + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_ABORT; + cmd->cdw10 = (cid << 16) | sqid; + + if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl) { + STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); + rc = 0; + } else { + ctrlr->outstanding_aborts++; + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; + memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; + +} + +int +nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + cmd->cdw10 = (size >> 2) - 1; + cmd->cdw11 = offset >> 2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE; + cmd->cdw10 = ((uint32_t)secp << 24) | ((uint32_t)spsp << 8) | ((uint32_t)nssf); + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void 
*cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_SEND; + cmd->cdw10 = ((uint32_t)secp << 24) | ((uint32_t)spsp << 8) | ((uint32_t)nssf); + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c new file mode 100644 index 00000000..80de5328 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c @@ -0,0 +1,83 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +bool +spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->quirks & NVME_QUIRK_OCSSD) { + // TODO: There isn't a standardized way to identify Open-Channel SSD + // different verdors may have different conditions. + + /* + * Current QEMU OpenChannel Device needs to check nsdata->vs[0]. + * Here check nsdata->vs[0] of the first namespace. 
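+ * In other words the check below is a vendor-specific quirk: CNEX Labs
+ * vendor ID plus vendor_specific[0] == 0x1 in the first namespace's Identify
+ * data; other Open-Channel devices would need their own detection added here.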
+ */ + if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) { + if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) { + return true; + } + } + } + return false; +} + + +int +spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) { + return -EINVAL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_GEOMETRY; + cmd->nsid = nsid; + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c new file mode 100644 index 00000000..4589596a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_fabric.c @@ -0,0 +1,340 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe over Fabrics transport-independent functions + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/string.h" + +static int +nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status status; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; + cmd.ofst = offset; + cmd.attrib.size = size; + cmd.value.u64 = value; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, + nvme_completion_poll_cb, &status); + if (rc < 0) { + return rc; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + SPDK_ERRLOG("Property Set failed\n"); + return -1; + } + + return 0; +} + +static int +nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t *value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status status; + struct spdk_nvmf_fabric_prop_get_rsp *response; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; + cmd.ofst = offset; + cmd.attrib.size = size; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, nvme_completion_poll_cb, + &status); + if (rc < 0) { + return rc; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + SPDK_ERRLOG("Property Get failed\n"); + return -1; + } + + response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status.cpl; + + if (size == SPDK_NVMF_PROP_SIZE_4) { + *value = response->value.u32.low; + } else { + *value = response->value.u64; + } + + return 0; +} + +int +nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value); +} + +int +nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +int +nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + uint64_t tmp_value; + int rc; + rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value); + + if (!rc) { + *value = (uint32_t)tmp_value; + } + return rc; +} + +int +nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +static void +nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry, + void *cb_ctx, spdk_nvme_probe_cb probe_cb) +{ + struct spdk_nvme_transport_id trid; + uint8_t *end; + size_t len; + + memset(&trid, 0, sizeof(trid)); + + if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_WARNLOG("Skipping unsupported discovery service referral\n"); + return; + } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) { + SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype); + return; + } + + trid.trtype = entry->trtype; + if (!spdk_nvme_transport_available(trid.trtype)) { + SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n", + trid.trtype); + return; + } + + trid.adrfam = entry->adrfam; + + /* Ensure that subnqn is null terminated. 
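+	 * (The discovery log page entry reserves 256 bytes for SUBNQN; assuming
+	 * SPDK_NVMF_NQN_MAX_LEN is 223, the memchr() below scans at most the
+	 * first 224 bytes, i.e. the longest accepted NQN plus its terminator.)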
*/ + end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n"); + return; + } + len = end - entry->subnqn; + memcpy(trid.subnqn, entry->subnqn, len); + trid.subnqn[len] = '\0'; + + /* Convert traddr to a null terminated string. */ + len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' '); + memcpy(trid.traddr, entry->traddr, len); + if (spdk_str_chomp(trid.traddr) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n"); + } + + /* Convert trsvcid to a null terminated string. */ + len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' '); + memcpy(trid.trsvcid, entry->trsvcid, len); + if (spdk_str_chomp(trid.trsvcid) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n", + trid.subnqn, trid.trtype, + trid.traddr, trid.trsvcid); + + nvme_ctrlr_probe(&trid, NULL, probe_cb, cb_ctx); +} + +static int +nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr, + void *log_page, uint32_t size, uint64_t offset) +{ + struct nvme_completion_poll_status status; + int rc; + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset, + nvme_completion_poll_cb, &status); + if (rc < 0) { + return -1; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + return -1; + } + + return 0; +} + +int +nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + void *cb_ctx, spdk_nvme_probe_cb probe_cb) +{ + struct spdk_nvmf_discovery_log_page *log_page; + struct spdk_nvmf_discovery_log_page_entry *log_page_entry; + char buffer[4096]; + int rc; + uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0; + uint64_t remaining_num_rec = 0; + uint16_t recfmt; + + memset(buffer, 0x0, 4096); + buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page, + entries[0])) / + sizeof(struct spdk_nvmf_discovery_log_page_entry); + buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry); + do { + rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n"); + return rc; + } + + if (!remaining_num_rec) { + log_page = (struct spdk_nvmf_discovery_log_page *)buffer; + recfmt = from_le16(&log_page->recfmt); + if (recfmt != 0) { + SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt); + return -EPROTO; + } + remaining_num_rec = log_page->numrec; + log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]); + log_page_entry = &log_page->entries[0]; + numrec = spdk_min(remaining_num_rec, buffer_max_entries_first); + } else { + numrec = spdk_min(remaining_num_rec, buffer_max_entries); + log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer; + } + + for (i = 0; i < numrec; i++) { + nvme_fabric_discover_probe(log_page_entry++, cb_ctx, probe_cb); + } + remaining_num_rec -= numrec; + log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry); + } while (remaining_num_rec != 0); + + return 0; +} + +int +nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries) +{ + struct nvme_completion_poll_status status; + struct spdk_nvmf_fabric_connect_rsp *rsp; + struct spdk_nvmf_fabric_connect_cmd cmd; + struct spdk_nvmf_fabric_connect_data 
*nvmf_data; + struct spdk_nvme_ctrlr *ctrlr; + int rc; + + if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) { + return -EINVAL; + } + + ctrlr = qpair->ctrlr; + if (!ctrlr) { + return -EINVAL; + } + + nvmf_data = spdk_dma_zmalloc(sizeof(*nvmf_data), 0, NULL); + if (!nvmf_data) { + SPDK_ERRLOG("nvmf_data allocation error\n"); + return -ENOMEM; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; + cmd.qid = qpair->id; + cmd.sqsize = num_entries - 1; + cmd.kato = ctrlr->opts.keep_alive_timeout_ms; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvmf_data->cntlid = 0xFFFF; + } else { + nvmf_data->cntlid = ctrlr->cntlid; + } + + SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id), + "host ID size mismatch"); + memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid)); + snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn); + snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn); + + rc = spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, + (struct spdk_nvme_cmd *)&cmd, + nvmf_data, sizeof(*nvmf_data), + nvme_completion_poll_cb, &status); + if (rc < 0) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_dma_free(nvmf_data); + return rc; + } + + if (spdk_nvme_wait_for_completion(qpair, &status)) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_dma_free(nvmf_data); + return -EIO; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status.cpl; + ctrlr->cntlid = rsp->status_code_specific.success.cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid); + } + + spdk_dma_free(nvmf_data); + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h new file mode 100644 index 00000000..6e7714a4 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_internal.h @@ -0,0 +1,1003 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_INTERNAL_H__ +#define __NVME_INTERNAL_H__ + +#include "spdk/config.h" +#include "spdk/likely.h" +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" + +#if defined(__i386__) || defined(__x86_64__) +#include +#endif + +#include "spdk/queue.h" +#include "spdk/barrier.h" +#include "spdk/bit_array.h" +#include "spdk/mmio.h" +#include "spdk/pci_ids.h" +#include "spdk/util.h" +#include "spdk/nvme_intel.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +extern pid_t g_spdk_nvme_pid; + +/* + * Some Intel devices support vendor-unique read latency log page even + * though the log page directory says otherwise. + */ +#define NVME_INTEL_QUIRK_READ_LATENCY 0x1 + +/* + * Some Intel devices support vendor-unique write latency log page even + * though the log page directory says otherwise. + */ +#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2 + +/* + * The controller needs a delay before starts checking the device + * readiness, which is done by reading the NVME_CSTS_RDY bit. + */ +#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4 + +/* + * The controller performs best when I/O is split on particular + * LBA boundaries. + */ +#define NVME_INTEL_QUIRK_STRIPING 0x8 + +/* + * The controller needs a delay after allocating an I/O queue pair + * before it is ready to accept I/O commands. + */ +#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10 + +/* + * Earlier NVMe devices do not indicate whether unmapped blocks + * will read all zeroes or not. This define indicates that the + * device does in fact read all zeroes after an unmap event + */ +#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20 + +/* + * The controller doesn't handle Identify value others than 0 or 1 correctly. + */ +#define NVME_QUIRK_IDENTIFY_CNS 0x40 + +/* + * The controller supports Open Channel command set if matching additional + * condition, like the first byte (value 0x1) in the vendor specific + * bits of the namespace identify structure is set. + */ +#define NVME_QUIRK_OCSSD 0x80 + +/* + * The controller has an Intel vendor ID but does not support Intel vendor-specific + * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor + * ID but do not support these log pages. + */ +#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100 + +#define NVME_MAX_ASYNC_EVENTS (8) + +#define NVME_MIN_TIMEOUT_PERIOD (5) +#define NVME_MAX_TIMEOUT_PERIOD (120) + +/* Maximum log page size to fetch for AERs. */ +#define NVME_MAX_AER_LOG_SIZE (4096) + +/* + * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this + * define specifies the maximum number of queues this driver will actually + * try to configure, if available. + */ +#define DEFAULT_MAX_IO_QUEUES (1024) +#define DEFAULT_IO_QUEUE_SIZE (256) + +#define DEFAULT_ADMIN_QUEUE_REQUESTS (32) +#define DEFAULT_IO_QUEUE_REQUESTS (512) + +/* We want to fit submission and completion rings each in a single 2MB + * hugepage to ensure physical address contiguity. 
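+ *
+ * As a rough illustration, assuming the spec-defined sizes of 64 bytes per
+ * submission queue entry and 16 bytes per completion queue entry, spdk_max()
+ * below evaluates to 64 and the limit works out to 0x200000 / 64 = 32768
+ * entries per queue.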
+ */ +#define MAX_IO_QUEUE_ENTRIES (0x200000 / spdk_max( \ + sizeof(struct spdk_nvme_cmd), \ + sizeof(struct spdk_nvme_cpl))) + +enum nvme_payload_type { + NVME_PAYLOAD_TYPE_INVALID = 0, + + /** nvme_request::u.payload.contig_buffer is valid for this request */ + NVME_PAYLOAD_TYPE_CONTIG, + + /** nvme_request::u.sgl is valid for this request */ + NVME_PAYLOAD_TYPE_SGL, +}; + +/* + * Controller support flags. + */ +enum spdk_nvme_ctrlr_flags { + SPDK_NVME_CTRLR_SGL_SUPPORTED = 0x1, /**< The SGL is supported */ +}; + +/** + * Descriptor for a request data payload. + */ +struct nvme_payload { + /** + * Functions for retrieving physical addresses for scattered payloads. + */ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn; + + /** + * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the + * virtual memory address of a single virtually contiguous buffer. + * + * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the + * cb_arg that will be passed to the SGL callback functions. + */ + void *contig_or_cb_arg; + + /** Virtual memory address of a single virtually contiguous metadata buffer */ + void *md; +}; + +#define NVME_PAYLOAD_CONTIG(contig_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = NULL, \ + .next_sge_fn = NULL, \ + .contig_or_cb_arg = (contig_), \ + .md = (md_), \ + } + +#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = (reset_sgl_fn_), \ + .next_sge_fn = (next_sge_fn_), \ + .contig_or_cb_arg = (cb_arg_), \ + .md = (md_), \ + } + +static inline enum nvme_payload_type +nvme_payload_type(const struct nvme_payload *payload) { + return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG; +} + +struct nvme_error_cmd { + bool do_not_submit; + uint64_t timeout_tsc; + uint32_t err_count; + uint8_t opc; + struct spdk_nvme_status status; + TAILQ_ENTRY(nvme_error_cmd) link; +}; + +struct nvme_request { + struct spdk_nvme_cmd cmd; + + uint8_t retries; + + bool timed_out; + + /** + * Number of children requests still outstanding for this + * request which was split into multiple child requests. + */ + uint16_t num_children; + + /** + * Offset in bytes from the beginning of payload for this request. + * This is used for I/O commands that are split into multiple requests. + */ + uint32_t payload_offset; + uint32_t md_offset; + + uint32_t payload_size; + + /** + * Timeout ticks for error injection requests, can be extended in future + * to support per-request timeout feature. + */ + uint64_t timeout_tsc; + + /** + * Data payload for this request's command. + */ + struct nvme_payload payload; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + STAILQ_ENTRY(nvme_request) stailq; + + struct spdk_nvme_qpair *qpair; + + /* + * The value of spdk_get_ticks() when the request was submitted to the hardware. + * Only set if ctrlr->timeout_enabled is true. + */ + uint64_t submit_tick; + + /** + * The active admin request can be moved to a per process pending + * list based on the saved pid to tell which process it belongs + * to. The cpl saves the original completion information which + * is used in the completion callback. + * NOTE: these below two fields are only used for admin request. + */ + pid_t pid; + struct spdk_nvme_cpl cpl; + + /** + * The following members should not be reordered with members + * above. 
These members are only needed when splitting + * requests which is done rarely, and the driver is careful + * to not touch the following fields until a split operation is + * needed, to avoid touching an extra cacheline. + */ + + /** + * Points to the outstanding child requests for a parent request. + * Only valid if a request was split into multiple children + * requests, and is not initialized for non-split requests. + */ + TAILQ_HEAD(, nvme_request) children; + + /** + * Linked-list pointers for a child request in its parent's list. + */ + TAILQ_ENTRY(nvme_request) child_tailq; + + /** + * Points to a parent request if part of a split request, + * NULL otherwise. + */ + struct nvme_request *parent; + + /** + * Completion status for a parent request. Initialized to all 0's + * (SUCCESS) before child requests are submitted. If a child + * request completes with error, the error status is copied here, + * to ensure that the parent request is also completed with error + * status once all child requests are completed. + */ + struct spdk_nvme_cpl parent_status; + + /** + * The user_cb_fn and user_cb_arg fields are used for holding the original + * callback data when using nvme_allocate_request_user_copy. + */ + spdk_nvme_cmd_cb user_cb_fn; + void *user_cb_arg; + void *user_buffer; +}; + +struct nvme_completion_poll_status { + struct spdk_nvme_cpl cpl; + bool done; +}; + +struct nvme_async_event_request { + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_request *req; + struct spdk_nvme_cpl cpl; +}; + +struct spdk_nvme_qpair { + STAILQ_HEAD(, nvme_request) free_req; + STAILQ_HEAD(, nvme_request) queued_req; + /** Commands opcode in this list will return error */ + TAILQ_HEAD(, nvme_error_cmd) err_cmd_head; + /** Requests in this list will return error */ + STAILQ_HEAD(, nvme_request) err_req_head; + + enum spdk_nvme_transport_type trtype; + + uint16_t id; + + uint8_t qprio; + + /* + * Members for handling IO qpair deletion inside of a completion context. + * These are specifically defined as single bits, so that they do not + * push this data structure out to another cacheline. + */ + uint8_t in_completion_context : 1; + uint8_t delete_after_completion_context: 1; + + /* + * Set when no deletion notification is needed. For example, the process + * which allocated this qpair exited unexpectedly. + */ + uint8_t no_deletion_notification_needed: 1; + + struct spdk_nvme_ctrlr *ctrlr; + + /* List entry for spdk_nvme_ctrlr::active_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) tailq; + + /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq; + + struct spdk_nvme_ctrlr_process *active_proc; + + void *req_buf; +}; + +struct spdk_nvme_ns { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t sector_size; + + /* + * Size of data transferred as part of each block, + * including metadata if FLBAS indicates the metadata is transferred + * as part of the data buffer at the end of each LBA. + */ + uint32_t extended_lba_size; + + uint32_t md_size; + uint32_t pi_type; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + uint32_t id; + uint16_t flags; + + /* Namespace Identification Descriptor List (CNS = 03h) */ + uint8_t id_desc_list[4096]; +}; + +/** + * State of struct spdk_nvme_ctrlr (in particular, during initialization). + */ +enum nvme_ctrlr_state { + /** + * Wait before initializing the controller. + */ + NVME_CTRLR_STATE_INIT_DELAY, + + /** + * Controller has not been initialized yet. 
+	 */
+	NVME_CTRLR_STATE_INIT,
+
+	/**
+	 * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0.
+	 */
+	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,
+
+	/**
+	 * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1.
+	 */
+	NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
+
+	/**
+	 * Enable the controller by writing CC.EN to 1
+	 */
+	NVME_CTRLR_STATE_ENABLE,
+
+	/**
+	 * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller.
+	 */
+	NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
+
+	/**
+	 * Enable the Admin queue of the controller.
+	 */
+	NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE,
+
+	/**
+	 * Identify Controller command will be sent to the controller.
+	 */
+	NVME_CTRLR_STATE_IDENTIFY,
+
+	/**
+	 * Waiting for Identify Controller command to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY,
+
+	/**
+	 * Set Number of Queues of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_NUM_QUEUES,
+
+	/**
+	 * Waiting for Set Num of Queues command to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES,
+
+	/**
+	 * Get Number of Queues of the controller.
+	 */
+	NVME_CTRLR_STATE_GET_NUM_QUEUES,
+
+	/**
+	 * Waiting for Get Num of Queues command to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES,
+
+	/**
+	 * Construct Namespace data structures of the controller.
+	 */
+	NVME_CTRLR_STATE_CONSTRUCT_NS,
+
+	/**
+	 * Get active Namespace list of the controller.
+	 */
+	NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS,
+
+	/**
+	 * Get Identify Namespace Data structure for each NS.
+	 */
+	NVME_CTRLR_STATE_IDENTIFY_NS,
+
+	/**
+	 * Waiting for the Identify Namespace commands to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS,
+
+	/**
+	 * Get Identify Namespace Identification Descriptors.
+	 */
+	NVME_CTRLR_STATE_IDENTIFY_ID_DESCS,
+
+	/**
+	 * Waiting for the Identify Namespace Identification
+	 * Descriptors to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS,
+
+	/**
+	 * Configure AER of the controller.
+	 */
+	NVME_CTRLR_STATE_CONFIGURE_AER,
+
+	/**
+	 * Waiting for the Configure AER to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER,
+
+	/**
+	 * Set supported log pages of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES,
+
+	/**
+	 * Set supported features of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES,
+
+	/**
+	 * Set Doorbell Buffer Config of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_DB_BUF_CFG,
+
+	/**
+	 * Waiting for Doorbell Buffer Config to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG,
+
+	/**
+	 * Set Keep Alive Timeout of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+
+	/**
+	 * Waiting for Set Keep Alive Timeout to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT,
+
+	/**
+	 * Set Host ID of the controller.
+	 */
+	NVME_CTRLR_STATE_SET_HOST_ID,
+
+	/**
+	 * Waiting for Set Host ID to be completed.
+	 */
+	NVME_CTRLR_STATE_WAIT_FOR_HOST_ID,
+
+	/**
+	 * Controller initialization has completed and the controller is ready.
+	 */
+	NVME_CTRLR_STATE_READY,
+
+	/**
+	 * Controller initialization has an error.
+	 */
+	NVME_CTRLR_STATE_ERROR
+};
+
+#define NVME_TIMEOUT_INFINITE UINT64_MAX
+
+/*
+ * Used to track properties for all processes accessing the controller.
+ */ +struct spdk_nvme_ctrlr_process { + /** Whether it is the primary process */ + bool is_primary; + + /** Process ID */ + pid_t pid; + + /** Active admin requests to be completed */ + STAILQ_HEAD(, nvme_request) active_reqs; + + TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq; + + /** Per process PCI device handle */ + struct spdk_pci_device *devhandle; + + /** Reference to track the number of attachment to this controller. */ + int ref; + + /** Allocated IO qpairs */ + TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs; + + spdk_nvme_aer_cb aer_cb_fn; + void *aer_cb_arg; + + /** + * A function pointer to timeout callback function + */ + spdk_nvme_timeout_cb timeout_cb_fn; + void *timeout_cb_arg; + uint64_t timeout_ticks; +}; + +/* + * One of these per allocated PCI device. + */ +struct spdk_nvme_ctrlr { + /* Hot data (accessed in I/O path) starts here. */ + + /** Array of namespaces indexed by nsid - 1 */ + struct spdk_nvme_ns *ns; + + struct spdk_nvme_transport_id trid; + + uint32_t num_ns; + + bool is_removed; + + bool is_resetting; + + bool is_failed; + + bool timeout_enabled; + + uint16_t max_sges; + + uint16_t cntlid; + + /** Controller support flags */ + uint64_t flags; + + /* Cold data (not accessed in normal I/O path) is after this point. */ + + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + + enum nvme_ctrlr_state state; + uint64_t state_timeout_tsc; + + uint64_t next_keep_alive_tick; + uint64_t keep_alive_interval_ticks; + + TAILQ_ENTRY(spdk_nvme_ctrlr) tailq; + + /** All the log pages supported */ + bool log_page_supported[256]; + + /** All the features supported */ + bool feature_supported[256]; + + /** maximum i/o size in bytes */ + uint32_t max_xfer_size; + + /** minimum page size supported by this controller in bytes */ + uint32_t min_page_size; + + /** selected memory page size for this controller in bytes */ + uint32_t page_size; + + uint32_t num_aers; + struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; + + /** guards access to the controller itself, including admin queues */ + pthread_mutex_t ctrlr_lock; + + + struct spdk_nvme_qpair *adminq; + + /** shadow doorbell buffer */ + uint32_t *shadow_doorbell; + /** eventidx buffer */ + uint32_t *eventidx; + + /** + * Identify Controller data. + */ + struct spdk_nvme_ctrlr_data cdata; + + /** + * Keep track of active namespaces + */ + uint32_t *active_ns_list; + + /** + * Array of Identify Namespace data. + * + * Stored separately from ns since nsdata should not normally be accessed during I/O. + */ + struct spdk_nvme_ns_data *nsdata; + + struct spdk_bit_array *free_io_qids; + TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs; + + struct spdk_nvme_ctrlr_opts opts; + + uint64_t quirks; + + /* Extra sleep time during controller initialization */ + uint64_t sleep_timeout_tsc; + + /** Track all the processes manage this controller */ + TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs; + + + STAILQ_HEAD(, nvme_request) queued_aborts; + uint32_t outstanding_aborts; +}; + +struct nvme_driver { + pthread_mutex_t lock; + + /** Multi-process shared attached controller list */ + TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs; + + bool initialized; + struct spdk_uuid default_extended_host_id; +}; + +extern struct nvme_driver *g_spdk_nvme_driver; + +int nvme_driver_init(void); + +/* + * Used for the spdk_nvme_connect() public API to save user specified opts. 
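+ * (The accompanying opts_size field records the size of the options structure
+ * the caller actually passed in, which appears to be what allows
+ * spdk_nvme_connect() to copy user options of differing sizes safely.)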
+ */ +struct spdk_nvme_ctrlr_connect_opts { + const struct spdk_nvme_ctrlr_opts *opts; + size_t opts_size; +}; + +#define nvme_delay usleep + +static inline bool +nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id == 0; +} + +static inline bool +nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id != 0; +} + +static inline int +nvme_robust_mutex_lock(pthread_mutex_t *mtx) +{ + int rc = pthread_mutex_lock(mtx); + +#ifndef __FreeBSD__ + if (rc == EOWNERDEAD) { + rc = pthread_mutex_consistent(mtx); + } +#endif + + return rc; +} + +static inline int +nvme_robust_mutex_unlock(pthread_mutex_t *mtx) +{ + return pthread_mutex_unlock(mtx); +} + +/* Admin functions */ +int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, + uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, + uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl); +int spdk_nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status); +int spdk_nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex); + +struct spdk_nvme_ctrlr_process *spdk_nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, + pid_t pid); +struct spdk_nvme_ctrlr_process *spdk_nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle); +void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, void *devhandle, + spdk_nvme_probe_cb probe_cb, void *cb_ctx); + +int 
nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove); +int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_connected(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req); +int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap); +int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs); +void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs); +int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests); +void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair); +void nvme_qpair_enable(struct spdk_nvme_qpair *qpair); +void nvme_qpair_disable(struct spdk_nvme_qpair *qpair); +int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req); + +int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns); +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_destruct(struct spdk_nvme_ns *ns); + +int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, void *cb_ctx, + spdk_nvme_probe_cb probe_cb); +int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries); + +static inline struct nvme_request * +nvme_allocate_request(struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = STAILQ_FIRST(&qpair->free_req); + if (req == NULL) { + return req; + } + + STAILQ_REMOVE_HEAD(&qpair->free_req, stailq); + + /* + * Only memset/zero fields that need it. All other fields + * will be initialized appropriately either later in this + * function, or before they are needed later in the + * submission patch. For example, the children + * TAILQ_ENTRY and following members are + * only used as part of I/O splitting so we avoid + * memsetting them until it is actually needed. + * They will be initialized in nvme_request_add_child() + * if the request is split. 
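+	 * As a concrete note on the memset() below: given the field order of
+	 * struct nvme_request, offsetof(struct nvme_request, payload_size)
+	 * covers cmd, retries, timed_out, num_children, payload_offset and
+	 * md_offset, while everything from payload_size onward is either
+	 * assigned explicitly below or deliberately left untouched.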
+ */ + memset(req, 0, offsetof(struct nvme_request, payload_size)); + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->payload = *payload; + req->payload_size = payload_size; + req->qpair = qpair; + req->pid = g_spdk_nvme_pid; + + return req; +} + +static inline struct nvme_request * +nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + return nvme_allocate_request(qpair, &payload, payload_size, cb_fn, cb_arg); +} + +static inline struct nvme_request * +nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg); +} + +struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller); + +static inline void +nvme_complete_request(struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_qpair *qpair = req->qpair; + struct spdk_nvme_cpl err_cpl; + struct nvme_error_cmd *cmd; + + /* error injection at completion path, + * only inject for successful completed commands + */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) && + !spdk_nvme_cpl_is_error(cpl))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + + if (cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + + err_cpl = *cpl; + err_cpl.status.sct = cmd->status.sct; + err_cpl.status.sc = cmd->status.sc; + + cpl = &err_cpl; + cmd->err_count--; + break; + } + } + } + + if (req->cb_fn) { + req->cb_fn(req->cb_arg, cpl); + } +} + +static inline void +nvme_free_request(struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + assert(req->qpair != NULL); + + STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq); +} + +void nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child); +int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick); +uint64_t nvme_get_quirks(const struct spdk_pci_id *id); + +int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx); +int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx); + +bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl); +void nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd); +void nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl); + +struct spdk_nvme_ctrlr *spdk_nvme_get_ctrlr_by_trid_unsafe( + const struct spdk_nvme_transport_id *trid); + +/* Transport specific functions */ +#define DECLARE_TRANSPORT(name) \ + struct spdk_nvme_ctrlr *nvme_ ## name ## _ctrlr_construct(const struct spdk_nvme_transport_id *trid, const struct spdk_nvme_ctrlr_opts *opts, \ + void *devhandle); \ + int nvme_ ## name ## _ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); \ + int nvme_ ## name ## _ctrlr_scan(const struct spdk_nvme_transport_id *trid, void *cb_ctx, spdk_nvme_probe_cb probe_cb, spdk_nvme_remove_cb remove_cb, bool direct_connect); \ + int nvme_ ## name ## _ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr); \ + int nvme_ ## name ## _ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); \ + int nvme_ ## name ## _ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); \ + int nvme_ 
## name ## _ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); \ + int nvme_ ## name ## _ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); \ + uint32_t nvme_ ## name ## _ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr); \ + uint16_t nvme_ ## name ## _ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr); \ + struct spdk_nvme_qpair *nvme_ ## name ## _ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts); \ + void *nvme_ ## name ## _ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size); \ + int nvme_ ## name ## _ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size); \ + int nvme_ ## name ## _ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _qpair_enable(struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _qpair_disable(struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _qpair_reset(struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _qpair_fail(struct spdk_nvme_qpair *qpair); \ + int nvme_ ## name ## _qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); \ + int32_t nvme_ ## name ## _qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions); + +DECLARE_TRANSPORT(transport) /* generic transport dispatch functions */ +DECLARE_TRANSPORT(pcie) +#ifdef SPDK_CONFIG_RDMA +DECLARE_TRANSPORT(rdma) +#endif + +#undef DECLARE_TRANSPORT + +/* + * Below ref related functions must be called with the global + * driver lock held for the multi-process condition. + * Within these functions, the per ctrlr ctrlr_lock is also + * acquired for the multi-thread condition. + */ +void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr); + +static inline bool +_is_page_aligned(uint64_t address, uint64_t page_size) +{ + return (address & (page_size - 1)) == 0; +} + +#endif /* __NVME_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c new file mode 100644 index 00000000..b88bf174 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns.c @@ -0,0 +1,360 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +static inline struct spdk_nvme_ns_data * +_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return &ns->ctrlr->nsdata[ns->id - 1]; +} + +/** + * Update Namespace flags based on Identify Controller + * and Identify Namespace. This can be also used for + * Namespace Attribute Notice events and Namespace + * operations such as Attach/Detach. + */ +void +nvme_ns_set_identify_data(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + nsdata = _nvme_ns_get_data(ns); + + ns->flags = 0x0000; + + ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads; + ns->extended_lba_size = ns->sector_size; + + ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms; + if (nsdata->flbas.extended) { + ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED; + ns->extended_lba_size += ns->md_size; + } + + ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size; + + if (nsdata->noiob) { + ns->sectors_per_stripe = nsdata->noiob; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING && + ns->ctrlr->cdata.vs[3] != 0) { + ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size / + ns->sector_size; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else { + ns->sectors_per_stripe = 0; + } + + if (ns->ctrlr->cdata.oncs.dsm) { + ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED; + } + + if (ns->ctrlr->cdata.vwc.present) { + ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_zeroes) { + ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED; + } + + if (nsdata->nsrescap.raw) { + ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED; + } + + ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) { + ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED; + ns->pi_type = nsdata->dps.pit; + } +} + +static int +nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status status; + struct spdk_nvme_ns_data *nsdata; + int rc; + + nsdata = _nvme_ns_get_data(ns); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_completion_poll_cb, &status); + if (rc != 0) { + return rc; + } + + if (spdk_nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, &status, + &ns->ctrlr->ctrlr_lock)) { + /* This can occur if the namespace is not active. Simply zero the + * namespace data and continue. 
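+		 * (nvme_ns_destruct() zeroes the cached namespace data, so an
+		 * inactive namespace simply reports zero capacity and is then
+		 * treated as inactive by spdk_nvme_ns_is_active().)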
*/ + nvme_ns_destruct(ns); + return 0; + } + + nvme_ns_set_identify_data(ns); + + return 0; +} + +static int +nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status status; + int rc; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + return 0; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n"); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id, + ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_completion_poll_cb, &status); + if (rc < 0) { + return rc; + } + + rc = spdk_nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, &status, &ns->ctrlr->ctrlr_lock); + if (rc != 0) { + SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n"); + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + } + + return rc; +} + +uint32_t +spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns) +{ + return ns->id; +} + +bool +spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata = NULL; + + /* + * According to the spec, valid NS has non-zero id. + */ + if (ns->id == 0) { + return false; + } + + nsdata = _nvme_ns_get_data(ns); + + /* + * According to the spec, Identify Namespace will return a zero-filled structure for + * inactive namespace IDs. + * Check NCAP since it must be nonzero for an active namespace. + */ + return nsdata->ncap != 0; +} + +struct spdk_nvme_ctrlr * +spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr; +} + +uint32_t +spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr->max_xfer_size; +} + +uint32_t +spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->sector_size; +} + +uint32_t +spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->extended_lba_size; +} + +uint64_t +spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns)->nsze; +} + +uint64_t +spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns) +{ + return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns); +} + +uint32_t +spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns) +{ + return ns->flags; +} + +enum spdk_nvme_pi_type +spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) { + return ns->pi_type; +} + +bool +spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? 
true : false; +} + +uint32_t +spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns) +{ + return ns->md_size; +} + +const struct spdk_nvme_ns_data * +spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns); +} + +enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value( + struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns); + + if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) { + return SPDK_NVME_DEALLOC_READ_00; + } else { + return data->dlfeat.bits.read_value; + } +} + +uint32_t +spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns) +{ + return ns->sectors_per_stripe; +} + +static const void * +_spdk_nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length) +{ + const struct spdk_nvme_ns_id_desc *desc; + size_t offset; + + offset = 0; + while (offset + 4 < sizeof(ns->id_desc_list)) { + desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset]; + + if (desc->nidl == 0) { + /* End of list */ + return NULL; + } + + /* + * Check if this descriptor fits within the list. + * 4 is the fixed-size descriptor header (not counted in NIDL). + */ + if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) { + /* Descriptor longer than remaining space in list (invalid) */ + return NULL; + } + + if (desc->nidt == type) { + *length = desc->nidl; + return &desc->nid[0]; + } + + offset += 4 + desc->nidl; + } + + return NULL; +} + +const struct spdk_uuid * +spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns) +{ + const struct spdk_uuid *uuid; + size_t uuid_size; + + uuid = _spdk_nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size); + if (uuid == NULL || uuid_size != sizeof(*uuid)) { + return NULL; + } + + return uuid; +} + +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + assert(id > 0); + + ns->ctrlr = ctrlr; + ns->id = id; + + rc = nvme_ctrlr_identify_ns(ns); + if (rc != 0) { + return rc; + } + + return nvme_ctrlr_identify_id_desc(ns); +} + +void nvme_ns_destruct(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + if (!ns->id) { + return; + } + + nsdata = _nvme_ns_get_data(ns); + memset(nsdata, 0, sizeof(*nsdata)); + ns->sector_size = 0; + ns->extended_lba_size = 0; + ns->md_size = 0; + ns->pi_type = 0; + ns->sectors_per_max_io = 0; + ns->sectors_per_stripe = 0; + ns->flags = 0; +} diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c new file mode 100644 index 00000000..9562cf5a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_cmd.c @@ -0,0 +1,1026 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +static struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t opc, uint32_t io_flags, + uint16_t apptag_mask, uint16_t apptag, bool check_sgl); + + +static bool +spdk_nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io, + uint32_t sectors_per_stripe, uint32_t qdepth) +{ + uint32_t child_per_io; + + if (sectors_per_stripe > 0) { + child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe; + } else { + child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %d\n", child_per_io); + + return child_per_io >= qdepth; +} + +static void +nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *child = child_arg; + struct nvme_request *parent = child->parent; + + nvme_request_remove_child(parent, child); + + if (spdk_nvme_cpl_is_error(cpl)) { + memcpy(&parent->parent_status, cpl, sizeof(*cpl)); + } + + if (parent->num_children == 0) { + nvme_complete_request(parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static void +nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent->num_children != UINT16_MAX); + + if (parent->num_children == 0) { + /* + * Defer initialization of the children TAILQ since it falls + * on a separate cacheline. This ensures we do not touch this + * cacheline except on request splitting cases, which are + * relatively rare. 
+ */ + TAILQ_INIT(&parent->children); + parent->parent = NULL; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + } + + parent->num_children++; + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + child->parent = parent; + child->cb_fn = nvme_cb_complete_child; + child->cb_arg = child; +} + +void +nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent != NULL); + assert(child != NULL); + assert(child->parent == parent); + assert(parent->num_children != 0); + + parent->num_children--; + TAILQ_REMOVE(&parent->children, child, child_tailq); +} + +static void +nvme_request_free_children(struct nvme_request *req) +{ + struct nvme_request *child, *tmp; + + if (req->num_children == 0) { + return; + } + + /* free all child nvme_request */ + TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) { + nvme_request_remove_child(req, child); + nvme_request_free_children(child); + nvme_free_request(child); + } +} + +static struct nvme_request * +_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, + struct nvme_request *parent, bool check_sgl) +{ + struct nvme_request *child; + + child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn, + cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl); + if (child == NULL) { + nvme_request_free_children(parent); + nvme_free_request(parent); + return NULL; + } + + nvme_request_add_child(parent, child); + return child; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint32_t sectors_per_max_io, uint32_t sector_mask, + uint16_t apptag_mask, uint16_t apptag) +{ + uint32_t sector_size; + uint32_t md_size = ns->md_size; + uint32_t remaining_lba_count = lba_count; + struct nvme_request *child; + + sector_size = ns->extended_lba_size; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (md_size == 8)) { + sector_size -= 8; + } + + while (remaining_lba_count > 0) { + lba_count = sectors_per_max_io - (lba & sector_mask); + lba_count = spdk_min(remaining_lba_count, lba_count); + + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, + io_flags, apptag_mask, apptag, req, true); + if (child == NULL) { + return NULL; + } + + remaining_lba_count -= lba_count; + lba += lba_count; + payload_offset += lba_count * sector_size; + md_offset += lba_count * md_size; + } + + return req; +} + +static void +_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req, + uint32_t opc, uint64_t lba, uint32_t lba_count, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct spdk_nvme_cmd *cmd; + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + *(uint64_t *)&cmd->cdw10 = lba; + + if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + switch (ns->pi_type) { + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1: + case 
SPDK_NVME_FMT_NVM_PROTECTION_TYPE2: + cmd->cdw14 = (uint32_t)lba; + break; + } + } + + cmd->cdw12 = lba_count - 1; + cmd->cdw12 |= io_flags; + + cmd->cdw15 = apptag_mask; + cmd->cdw15 = (cmd->cdw15 << 16 | apptag); +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_prp(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + bool start_valid, end_valid, last_sge, child_equals_parent; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint32_t page_size = qpair->ctrlr->page_size; + uintptr_t address; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + while (req_current_length < req->payload_size) { + + if (sge_length == 0) { + continue; + } else if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + /* + * The start of the SGE is invalid if the start address is not page aligned, + * unless it is the first SGE in the child request. + */ + start_valid = child_length == 0 || _is_page_aligned(address, page_size); + + /* Boolean for whether this is the last SGE in the parent request. */ + last_sge = (req_current_length + sge_length == req->payload_size); + + /* + * The end of the SGE is invalid if the end address is not page aligned, + * unless it is the last SGE in the parent request. + */ + end_valid = last_sge || _is_page_aligned(address + sge_length, page_size); + + /* + * This child request equals the parent request, meaning that no splitting + * was required for the parent request (the one passed into this function). + * In this case, we do not create a child request at all - we just send + * the original request as a single request at the end of this function. + */ + child_equals_parent = (child_length + sge_length == req->payload_size); + + if (start_valid) { + /* + * The start of the SGE is valid, so advance the length parameters, + * to include this SGE with previous SGEs for this child request + * (if any). If it is not valid, we do not advance the length + * parameters nor get the next SGE, because we must send what has + * been collected before this SGE as a child request. + */ + child_length += sge_length; + req_current_length += sge_length; + if (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + } + /* + * If the next SGE is not page aligned, we will need to create a child + * request for what we have so far, and then start a new child request for + * the next SGE. + */ + start_valid = _is_page_aligned(address, page_size); + } + + if (start_valid && end_valid && !last_sge) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if child_equals_parent allows us to *not* create a child request + * when no splitting is required - in that case we will fall-through and just create + * a single request with no children for the entire I/O. 
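+		 * For example, with a 4 KiB controller page size, a payload whose
+		 * SGEs all start and end on page boundaries is sent as a single
+		 * request, while an SGE that ends mid-page (and is not the last
+		 * SGE) forces everything accumulated up to that point to be sent
+		 * as a child request.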
+ */ + if (!child_equals_parent) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. + */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint16_t max_sges, num_sges; + uintptr_t address; + + max_sges = ns->ctrlr->max_sges; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + num_sges = 0; + + while (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + + if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + child_length += sge_length; + req_current_length += sge_length; + num_sges++; + + if (num_sges < max_sges) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if the child equals the full payload allows us to *not* + * create a child request when no splitting is required - in that case we will + * fall-through and just create a single request with no children for the entire I/O. + */ + if (child_length != req->payload_size) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. 
+ */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + num_sges = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static struct nvme_request * +_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl) +{ + struct nvme_request *req; + uint32_t sector_size; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + + if (io_flags & 0xFFFF) { + /* The bottom 16 bits must be empty */ + SPDK_ERRLOG("io_flags 0x%x bottom 16 bits is not empty\n", + io_flags); + return NULL; + } + + sector_size = ns->extended_lba_size; + sectors_per_max_io = ns->sectors_per_max_io; + sectors_per_stripe = ns->sectors_per_stripe; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (ns->md_size == 8)) { + sector_size -= 8; + } + + req = nvme_allocate_request(qpair, payload, lba_count * sector_size, cb_fn, cb_arg); + if (req == NULL) { + return NULL; + } + + req->payload_offset = payload_offset; + req->md_offset = md_offset; + + /* + * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping. + * If this controller defines a stripe boundary and this I/O spans a stripe + * boundary, split the request into multiple requests and submit each + * separately to hardware. 
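+ * For example, if sectors_per_stripe is 256, a 20-block I/O starting at LBA 250 crosses a stripe boundary and is split into child requests of 6 blocks (LBA 250) and 14 blocks (LBA 256).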
+ */ + if (sectors_per_stripe > 0 && + (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) { + + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag); + } else if (lba_count > sectors_per_max_io) { + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) { + if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { + return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } else { + return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } + } + + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + return req; +} + +int +spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + 
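+ /* The request length checks above passed, so allocation failed because the qpair has no free request objects available right now. */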
return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void 
*cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (spdk_nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + 
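+ /* Write Zeroes transfers no data from the host, so a request with a null payload is sufficient. */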
req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint32_t type, + const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) { + return -EINVAL; + } + + if (ranges == NULL) { + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(qpair, (void *)ranges, + num_ranges * sizeof(struct spdk_nvme_dsm_range), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT; + cmd->nsid = ns->id; + + cmd->cdw10 = num_ranges - 1; + cmd->cdw11 = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FLUSH; + cmd->nsid = ns->id; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_register_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_register_action action, + enum spdk_nvme_reservation_register_cptpl cptpl, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_register_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER; + cmd->nsid = ns->id; + + /* Bits 0-2 */ + cmd->cdw10 = action; + /* Bit 3 */ + cmd->cdw10 |= ignore_key ? 1 << 3 : 0; + /* Bits 30-31 */ + cmd->cdw10 |= (uint32_t)cptpl << 30; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_key_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_release_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn, + cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE; + cmd->nsid = ns->id; + + /* Bits 0-2 */ + cmd->cdw10 = action; + /* Bit 3 */ + cmd->cdw10 |= ignore_key ? 
1 << 3 : 0; + /* Bits 8-15 */ + cmd->cdw10 |= (uint32_t)type << 8; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_acquire_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_acquire_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_acquire_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE; + cmd->nsid = ns->id; + + /* Bits 0-2 */ + cmd->cdw10 = action; + /* Bit 3 */ + cmd->cdw10 |= ignore_key ? 1 << 3 : 0; + /* Bits 8-15 */ + cmd->cdw10 |= (uint32_t)type << 8; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *payload, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t num_dwords; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (len % 4) { + return -EINVAL; + } + num_dwords = len / 4; + + req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_REPORT; + cmd->nsid = ns->id; + + cmd->cdw10 = num_dwords; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c new file mode 100644 index 00000000..2a574992 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c @@ -0,0 +1,232 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +int +spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *lba_list, uint32_t num_lbas, + struct spdk_ocssd_chunk_information_entry *chunk_info, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (!lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET; + cmd->nsid = ns->id; + + if (chunk_info != NULL) { + cmd->mptr = spdk_vtophys(chunk_info); + } + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list); + } + + cmd->cdw12 = num_lbas - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +static int +_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + enum spdk_ocssd_io_opcode opc, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + struct nvme_payload payload; + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!buffer || !lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. 
+ */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *dst_lba_list, + uint64_t *src_lba_list, + uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!dst_lba_list || !src_lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of source logical + * block addresses. + * Dword 14 and 15 store a pointer to the list of destination logical + * block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *src_lba_list; + *(uint64_t *)&cmd->cdw14 = *dst_lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list); + *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c new file mode 100644 index 00000000..8042380c --- /dev/null +++ b/src/spdk/lib/nvme/nvme_pcie.c @@ -0,0 +1,2142 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * Copyright (c) 2017, IBM Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over PCIe transport + */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "nvme_internal.h" +#include "nvme_uevent.h" + +/* + * Number of completion queue entries to process before ringing the + * completion queue doorbell. + */ +#define NVME_MIN_COMPLETIONS (1) +#define NVME_MAX_COMPLETIONS (128) + +#define NVME_ADMIN_ENTRIES (128) + +/* + * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL + * segment. + */ +#define NVME_MAX_SGL_DESCRIPTORS (253) + +#define NVME_MAX_PRP_LIST_ENTRIES (506) + +struct nvme_pcie_enum_ctx { + spdk_nvme_probe_cb probe_cb; + void *cb_ctx; + struct spdk_pci_addr pci_addr; + bool has_pci_addr; +}; + +/* PCIe transport extensions for spdk_nvme_ctrlr */ +struct nvme_pcie_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + /** NVMe MMIO register space */ + volatile struct spdk_nvme_registers *regs; + + /** NVMe MMIO register size */ + uint64_t regs_size; + + /* BAR mapping address which contains controller memory buffer */ + void *cmb_bar_virt_addr; + + /* BAR physical address which contains controller memory buffer */ + uint64_t cmb_bar_phys_addr; + + /* Controller memory buffer size in Bytes */ + uint64_t cmb_size; + + /* Current offset of controller memory buffer, relative to start of BAR virt addr */ + uint64_t cmb_current_offset; + + /* Last valid offset into CMB, this differs if CMB memory registration occurs or not */ + uint64_t cmb_max_offset; + + void *cmb_mem_register_addr; + size_t cmb_mem_register_size; + + bool cmb_io_data_supported; + + /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ + uint32_t doorbell_stride_u32; + + /* Opaque handle to associated PCI device. */ + struct spdk_pci_device *devhandle; + + /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. 
*/ + int claim_fd; + + /* Flag to indicate the MMIO register has been remapped */ + bool is_remapped; +}; + +struct nvme_tracker { + TAILQ_ENTRY(nvme_tracker) tq_list; + + struct nvme_request *req; + uint16_t cid; + + uint16_t rsvd1: 15; + uint16_t active: 1; + + uint32_t rsvd2; + + uint64_t rsvd3; + + uint64_t prp_sgl_bus_addr; + + union { + uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; + struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; + } u; +}; +/* + * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary + * and so that there is no padding required to meet alignment requirements. + */ +SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); + +/* PCIe transport extensions for spdk_nvme_qpair */ +struct nvme_pcie_qpair { + /* Submission queue tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue shadow tail doorbell */ + volatile uint32_t *sq_shadow_tdbl; + + /* Completion queue shadow head doorbell */ + volatile uint32_t *cq_shadow_hdbl; + + /* Submission queue event index */ + volatile uint32_t *sq_eventidx; + + /* Completion queue event index */ + volatile uint32_t *cq_eventidx; + + /* Submission queue */ + struct spdk_nvme_cmd *cmd; + + /* Completion queue */ + struct spdk_nvme_cpl *cpl; + + TAILQ_HEAD(, nvme_tracker) free_tr; + TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; + + /* Array of trackers indexed by command ID. */ + struct nvme_tracker *tr; + + uint16_t num_entries; + + uint16_t max_completions_cap; + + uint16_t sq_tail; + uint16_t cq_head; + uint16_t sq_head; + + uint8_t phase; + + bool is_enabled; + + /* + * Base qpair structure. + * This is located after the hot data in this structure so that the important parts of + * nvme_pcie_qpair are in the same cache line. + */ + struct spdk_nvme_qpair qpair; + + /* + * Fields below this point should not be touched on the normal I/O path. 
+ */ + + bool sq_in_cmb; + + uint64_t cmd_bus_addr; + uint64_t cpl_bus_addr; +}; + +static int nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx, + struct spdk_pci_addr *pci_addr); +static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair); +static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); + +__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; +static volatile uint16_t g_signal_lock; +static bool g_sigset = false; +static int hotplug_fd = -1; + +static void +nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) +{ + void *map_address; + + if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) { + return; + } + + assert(g_thread_mmio_ctrlr != NULL); + + if (!g_thread_mmio_ctrlr->is_remapped) { + map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (map_address == MAP_FAILED) { + SPDK_ERRLOG("mmap failed\n"); + g_signal_lock = 0; + return; + } + memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); + g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; + g_thread_mmio_ctrlr->is_remapped = true; + } + g_signal_lock = 0; + return; +} + +static void +nvme_pcie_ctrlr_setup_signal(void) +{ + struct sigaction sa; + + sa.sa_sigaction = nvme_sigbus_fault_sighandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(SIGBUS, &sa, NULL); +} + +static int +_nvme_pcie_hotplug_monitor(void *cb_ctx, spdk_nvme_probe_cb probe_cb, + spdk_nvme_remove_cb remove_cb) +{ + struct spdk_nvme_ctrlr *ctrlr, *tmp; + struct spdk_uevent event; + struct spdk_pci_addr pci_addr; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_process *proc; + + while (spdk_get_uevent(hotplug_fd, &event) > 0) { + if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || + event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { + if (event.action == SPDK_NVME_UEVENT_ADD) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", + event.traddr); + if (spdk_process_is_primary()) { + if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { + nvme_pcie_ctrlr_attach(probe_cb, cb_ctx, &pci_addr); + } + } + } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { + struct spdk_nvme_transport_id trid; + + memset(&trid, 0, sizeof(trid)); + trid.trtype = SPDK_NVME_TRANSPORT_PCIE; + snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); + + ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); + if (ctrlr == NULL) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", + event.traddr); + + nvme_ctrlr_fail(ctrlr, true); + + /* get the user app to clean up and stop I/O */ + if (remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + remove_cb(cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + } + + /* This is a work around for vfio-attached device hot remove detection. */ + TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { + /* NVMe controller BAR must be mapped to secondary process space before any access. 
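+ * Once that mapping exists, a CSTS register that reads back as all ones indicates the device was hot-removed, so the controller is failed and the remove callback invoked.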
*/ + proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (proc) { + csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr); + if (csts.raw == 0xffffffffU) { + nvme_ctrlr_fail(ctrlr, true); + if (remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + remove_cb(cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + } + return 0; +} + +static inline struct nvme_pcie_ctrlr * +nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); +} + +static inline struct nvme_pcie_qpair * +nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); +} + +static volatile void * +nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + return (volatile void *)((uintptr_t)pctrlr->regs + offset); +} + +int +nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +int +nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +int +nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +int +nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), + value); +} + +static int +nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), + value); +} + +static int +nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) +{ + return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), + aqa->raw); +} + +static int +nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), + &cmbloc->raw); +} + +static int +nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union 
spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +uint32_t +nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * For commands requiring more than 2 PRP entries, one PRP will be + * embedded in the command (prp1), and the rest of the PRP entries + * will be in a list pointed to by the command (prp2). This means + * that real max number of PRP entries we support is 506+1, which + * results in a max xfer size of 506*ctrlr->page_size. + */ + return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; +} + +uint16_t +nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_SGL_DESCRIPTORS; +} + +static void +nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr; + uint32_t bir; + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t size, unit_size, offset, bar_size, bar_phys_addr; + uint64_t mem_register_start, mem_register_end; + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + goto exit; + } + + if (!cmbsz.bits.sz) { + goto exit; + } + + bir = cmbloc.bits.bir; + /* Values 0 2 3 4 5 are valid for BAR */ + if (bir > 5 || bir == 1) { + goto exit; + } + + /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ + unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); + /* controller memory buffer size in Bytes */ + size = unit_size * cmbsz.bits.sz; + /* controller memory buffer offset from BAR in Bytes */ + offset = unit_size * cmbloc.bits.ofst; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, + &bar_phys_addr, &bar_size); + if ((rc != 0) || addr == NULL) { + goto exit; + } + + if (offset > bar_size) { + goto exit; + } + + if (size > bar_size - offset) { + goto exit; + } + + pctrlr->cmb_bar_virt_addr = addr; + pctrlr->cmb_bar_phys_addr = bar_phys_addr; + pctrlr->cmb_size = size; + pctrlr->cmb_current_offset = offset; + pctrlr->cmb_max_offset = offset + size; + + if (!cmbsz.bits.sqs) { + pctrlr->ctrlr.opts.use_cmb_sqs = false; + } + + /* If only SQS is supported use legacy mapping */ + if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) { + return; + } + + /* If CMB is less than 4MiB in size then abort CMB mapping */ + if (pctrlr->cmb_size < (1ULL << 22)) { + goto exit; + } + + mem_register_start = (((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + 0x1fffff) & ~(0x200000 - 1)); + mem_register_end = ((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size); + mem_register_end &= ~(uint64_t)(0x200000 - 1); + pctrlr->cmb_mem_register_addr = (void *)mem_register_start; + pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start; + + rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); + if (rc) { + SPDK_ERRLOG("spdk_mem_register() failed\n"); + goto exit; + } + pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr); + pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr); + pctrlr->cmb_io_data_supported = true; + + return; +exit: + pctrlr->cmb_bar_virt_addr = NULL; + pctrlr->ctrlr.opts.use_cmb_sqs = false; + return; +} + +static int +nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + union spdk_nvme_cmbloc_register cmbloc; + void *addr = pctrlr->cmb_bar_virt_addr; + + if (addr) { + if (pctrlr->cmb_mem_register_addr) { + 
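+ /* Undo the spdk_mem_register() that was done when the CMB was mapped for I/O data. */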
spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size); + } + + if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get_cmbloc() failed\n"); + return -EIO; + } + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned, + uint64_t *offset) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + uint64_t round_offset; + + round_offset = pctrlr->cmb_current_offset; + round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1); + + /* CMB may only consume part of the BAR, calculate accordingly */ + if (round_offset + length > pctrlr->cmb_max_offset) { + SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); + return -1; + } + + *offset = round_offset; + pctrlr->cmb_current_offset = round_offset + length; + + return 0; +} + +void * +nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + uint64_t offset; + + if (pctrlr->cmb_bar_virt_addr == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return NULL; + } + + if (!pctrlr->cmb_io_data_supported) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n"); + return NULL; + } + + if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size); + return NULL; + } + + return pctrlr->cmb_bar_virt_addr + offset; +} + +int +nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) +{ + /* + * Do nothing for now. + * TODO: Track free space so buffers may be reused. + */ + SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n", + __func__); + return 0; +} + +static int +nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr; + uint64_t phys_addr, size; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, + &phys_addr, &size); + pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; + if ((pctrlr->regs == NULL) || (rc != 0)) { + SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", + rc, pctrlr->regs); + return -1; + } + + pctrlr->regs_size = size; + nvme_pcie_ctrlr_map_cmb(pctrlr); + + return 0; +} + +static int +nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + void *addr = (void *)pctrlr->regs; + + if (pctrlr->ctrlr.is_removed) { + return rc; + } + + rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); + if (rc != 0) { + SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); + return -1; + } + + if (addr) { + /* NOTE: addr may have been remapped here. We're relying on DPDK to call + * munmap internally. 
+ */ + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_qpair *pqpair; + int rc; + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return -ENOMEM; + } + + pqpair->num_entries = NVME_ADMIN_ENTRIES; + + ctrlr->adminq = &pqpair->qpair; + + rc = nvme_qpair_init(ctrlr->adminq, + 0, /* qpair ID */ + ctrlr, + SPDK_NVME_QPRIO_URGENT, + NVME_ADMIN_ENTRIES); + if (rc != 0) { + return rc; + } + + return nvme_pcie_qpair_construct(ctrlr->adminq); +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct spdk_nvme_transport_id trid = {}; + struct nvme_pcie_enum_ctx *enum_ctx = ctx; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_pci_addr pci_addr; + + pci_addr = spdk_pci_device_get_addr(pci_dev); + + trid.trtype = SPDK_NVME_TRANSPORT_PCIE; + spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); + + /* Verify that this controller is not already attached */ + ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid); + if (ctrlr) { + if (spdk_process_is_primary()) { + /* Already attached */ + return 0; + } else { + return nvme_ctrlr_add_process(ctrlr, pci_dev); + } + } + + /* check whether user passes the pci_addr */ + if (enum_ctx->has_pci_addr && + (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { + return 1; + } + + return nvme_ctrlr_probe(&trid, pci_dev, + enum_ctx->probe_cb, enum_ctx->cb_ctx); +} + +int +nvme_pcie_ctrlr_scan(const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_remove_cb remove_cb, + bool direct_connect) +{ + struct nvme_pcie_enum_ctx enum_ctx = {}; + + enum_ctx.probe_cb = probe_cb; + enum_ctx.cb_ctx = cb_ctx; + + if (strlen(trid->traddr) != 0) { + if (spdk_pci_addr_parse(&enum_ctx.pci_addr, trid->traddr)) { + return -1; + } + enum_ctx.has_pci_addr = true; + } + + if (hotplug_fd < 0) { + hotplug_fd = spdk_uevent_connect(); + if (hotplug_fd < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); + } + } else { + _nvme_pcie_hotplug_monitor(cb_ctx, probe_cb, remove_cb); + } + + if (enum_ctx.has_pci_addr == false) { + return spdk_pci_nvme_enumerate(pcie_nvme_enum_cb, &enum_ctx); + } else { + return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); + } +} + +static int +nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx, struct spdk_pci_addr *pci_addr) +{ + struct nvme_pcie_enum_ctx enum_ctx; + + enum_ctx.probe_cb = probe_cb; + enum_ctx.cb_ctx = cb_ctx; + + return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, pci_addr); +} + +struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct spdk_pci_device *pci_dev = devhandle; + struct nvme_pcie_ctrlr *pctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + uint32_t cmd_reg; + int rc, claim_fd; + struct spdk_pci_id pci_id; + struct spdk_pci_addr pci_addr; + + if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) { + SPDK_ERRLOG("could not parse pci address\n"); + return NULL; + } + + claim_fd = spdk_pci_device_claim(&pci_addr); + if (claim_fd < 0) { + SPDK_ERRLOG("could not claim device %s\n", trid->traddr); + return NULL; + } + + pctrlr = 
spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pctrlr == NULL) { + close(claim_fd); + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + pctrlr->is_remapped = false; + pctrlr->ctrlr.is_removed = false; + pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE; + pctrlr->devhandle = devhandle; + pctrlr->ctrlr.opts = *opts; + pctrlr->claim_fd = claim_fd; + memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid)); + + rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); + if (rc != 0) { + close(claim_fd); + spdk_free(pctrlr); + return NULL; + } + + /* Enable PCI busmaster and disable INTx */ + spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x404; + spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4); + + if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + close(claim_fd); + spdk_free(pctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + close(claim_fd); + spdk_free(pctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); + + /* Doorbell stride is 2 ^ (dstrd + 2), + * but we want multiples of 4, so drop the + 2 */ + pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; + + rc = nvme_ctrlr_construct(&pctrlr->ctrlr); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + pci_id = spdk_pci_device_get_id(pci_dev); + pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); + + rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + /* Construct the primary process properties */ + rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + if (g_sigset != true) { + nvme_pcie_ctrlr_setup_signal(); + g_sigset = true; + } + + return &pctrlr->ctrlr; +} + +int +nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); + union spdk_nvme_aqa_register aqa; + + if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { + SPDK_ERRLOG("set_asq() failed\n"); + return -EIO; + } + + if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { + SPDK_ERRLOG("set_acq() failed\n"); + return -EIO; + } + + aqa.raw = 0; + /* acqs and asqs are 0-based. 
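+ * A value of N in these fields configures a queue of N + 1 entries, hence num_entries - 1 below.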
*/ + aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + + if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { + SPDK_ERRLOG("set_aqa() failed\n"); + return -EIO; + } + + return 0; +} + +int +nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); + + close(pctrlr->claim_fd); + + if (ctrlr->adminq) { + nvme_pcie_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_ctrlr_free_processes(ctrlr); + + nvme_pcie_ctrlr_free_bars(pctrlr); + + if (devhandle) { + spdk_pci_device_detach(devhandle); + } + + spdk_free(pctrlr); + + return 0; +} + +static void +nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) +{ + tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); + tr->cid = cid; + tr->active = false; +} + +int +nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + pqpair->sq_tail = pqpair->cq_head = 0; + + /* + * First time through the completion queue, HW will set phase + * bit on completions to 1. So set this to 1 here, indicating + * we're looking for a 1 to know which entries have completed. + * we'll toggle the bit each time when the completion queue + * rolls over. + */ + pqpair->phase = 1; + + memset(pqpair->cmd, 0, + pqpair->num_entries * sizeof(struct spdk_nvme_cmd)); + memset(pqpair->cpl, 0, + pqpair->num_entries * sizeof(struct spdk_nvme_cpl)); + + return 0; +} + +static int +nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + uint16_t i; + volatile uint32_t *doorbell_base; + uint64_t offset; + uint16_t num_trackers; + size_t page_align = 0x200000; + uint32_t flags = SPDK_MALLOC_DMA; + + /* + * Limit the maximum number of completions to return per call to prevent wraparound, + * and calculate how many trackers can be submitted at once without overflowing the + * completion queue. + */ + pqpair->max_completions_cap = pqpair->num_entries / 4; + pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); + pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); + num_trackers = pqpair->num_entries - pqpair->max_completions_cap; + + SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", + pqpair->max_completions_cap, num_trackers); + + assert(num_trackers != 0); + + pqpair->sq_in_cmb = false; + + if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { + flags |= SPDK_MALLOC_SHARE; + } + + /* cmd and cpl rings must be aligned on page size boundaries. */ + if (ctrlr->opts.use_cmb_sqs) { + if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), + sysconf(_SC_PAGESIZE), &offset) == 0) { + pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset; + pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset; + pqpair->sq_in_cmb = true; + } + } + + /* To ensure physical address contiguity we make each ring occupy + * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. 
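+ * With 2 MiB hugepages (page_align) and 64-byte submission queue entries, one page holds up to 32768 entries.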
+ */ + if (pqpair->sq_in_cmb == false) { + pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd), + page_align, &pqpair->cmd_bus_addr, + SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cmd == NULL) { + SPDK_ERRLOG("alloc qpair_cmd failed\n"); + return -ENOMEM; + } + } + + pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl), + page_align, &pqpair->cpl_bus_addr, + SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cpl == NULL) { + SPDK_ERRLOG("alloc qpair_cpl failed\n"); + return -ENOMEM; + } + + doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; + pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + + /* + * Reserve space for all of the trackers in a single allocation. + * struct nvme_tracker must be padded so that its size is already a power of 2. + * This ensures the PRP list embedded in the nvme_tracker object will not span a + * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. + */ + pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair->tr == NULL) { + SPDK_ERRLOG("nvme_tr failed\n"); + return -ENOMEM; + } + + TAILQ_INIT(&pqpair->free_tr); + TAILQ_INIT(&pqpair->outstanding_tr); + + for (i = 0; i < num_trackers; i++) { + tr = &pqpair->tr[i]; + nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr)); + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } + + nvme_pcie_qpair_reset(qpair); + + return 0; +} + +static inline void +nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + /* dst and src are known to be non-overlapping and 64-byte aligned. */ +#if defined(__AVX__) + __m256i *d256 = (__m256i *)dst; + const __m256i *s256 = (const __m256i *)src; + + _mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0])); + _mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1])); +#elif defined(__SSE2__) + __m128i *d128 = (__m128i *)dst; + const __m128i *s128 = (const __m128i *)src; + + _mm_store_si128(&d128[0], _mm_load_si128(&s128[0])); + _mm_store_si128(&d128[1], _mm_load_si128(&s128[1])); + _mm_store_si128(&d128[2], _mm_load_si128(&s128[2])); + _mm_store_si128(&d128[3], _mm_load_si128(&s128[3])); +#else + *dst = *src; +#endif +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *active_req = req; + struct spdk_nvme_ctrlr_process *active_proc; + + /* + * The admin request is from another process. Move to the per + * process list for that process to handle it later. + */ + assert(nvme_qpair_is_admin_queue(qpair)); + assert(active_req->pid != getpid()); + + active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid); + if (active_proc) { + /* Save the original completion information */ + memcpy(&active_req->cpl, cpl, sizeof(*cpl)); + STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); + } else { + SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", + active_req->pid); + + nvme_free_request(active_req); + } +} + +/** + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *req, *tmp_req; + pid_t pid = getpid(); + struct spdk_nvme_ctrlr_process *proc; + + /* + * Check whether there is any pending admin request from + * other active processes. + */ + assert(nvme_qpair_is_admin_queue(qpair)); + + proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + if (!proc) { + SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); + assert(proc); + return; + } + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == pid); + + nvme_complete_request(req, &req->cpl); + nvme_free_request(req); + } +} + +static inline int +nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); +} + +static bool +nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, + volatile uint32_t *shadow_db, + volatile uint32_t *eventidx) +{ + uint16_t old; + + if (!shadow_db) { + return true; + } + + old = *shadow_db; + *shadow_db = value; + + if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { + return false; + } + + return true; +} + +static void +nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + struct nvme_request *req; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + + req = tr->req; + assert(req != NULL); + req->timed_out = false; + if (spdk_unlikely(pctrlr->ctrlr.timeout_enabled)) { + req->submit_tick = spdk_get_ticks(); + } else { + req->submit_tick = 0; + } + + pqpair->tr[tr->cid].active = true; + + /* Copy the command from the tracker to the submission queue. */ + nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + + if (++pqpair->sq_tail == pqpair->num_entries) { + pqpair->sq_tail = 0; + } + + if (pqpair->sq_tail == pqpair->sq_head) { + SPDK_ERRLOG("sq_tail is passing sq_head!\n"); + } + + spdk_wmb(); + if (spdk_likely(nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->sq_tail, + pqpair->sq_shadow_tdbl, + pqpair->sq_eventidx))) { + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); + g_thread_mmio_ctrlr = NULL; + } +} + +static void +nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + struct spdk_nvme_cpl *cpl, bool print_on_error) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_request *req; + bool retry, error, was_active; + bool req_from_current_proc = true; + + req = tr->req; + + assert(req != NULL); + + error = spdk_nvme_cpl_is_error(cpl); + retry = error && nvme_completion_is_retry(cpl) && + req->retries < spdk_nvme_retry_count; + + if (error && print_on_error) { + nvme_qpair_print_command(qpair, &req->cmd); + nvme_qpair_print_completion(qpair, cpl); + } + + was_active = pqpair->tr[cpl->cid].active; + pqpair->tr[cpl->cid].active = false; + + assert(cpl->cid == req->cmd.cid); + + if (retry) { + req->retries++; + nvme_pcie_qpair_submit_tracker(qpair, tr); + } else { + if (was_active) { + /* Only check admin requests from different processes. 
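The two helpers above implement the shadow-doorbell optimization (the shadow and eventidx buffers are typically set up with the Doorbell Buffer Config admin command): the new tail or head value is written to the shadow doorbell, and the real MMIO doorbell is rung only when that write crosses the event index the controller advertised. The comparison is done modulo 2^16 so it stays correct across queue-index wraparound. A small standalone check of the same comparison (need_event is a name invented for the sketch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same wraparound-safe test as nvme_pcie_qpair_need_event(): true when
 * event_idx lies in the half-open interval (old, new_idx], evaluated
 * modulo 2^16. */
static int need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
{
	return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
}

int main(void)
{
	/* Controller asked to be notified at index 10; tail moved 8 -> 12. */
	assert(need_event(10, 12, 8) == 1);
	/* Tail moved 3 -> 7; the event index (10) was not crossed. */
	assert(need_event(10, 7, 3) == 0);
	/* Wraparound: tail moved 65533 -> 2, event index 65535 was crossed. */
	assert(need_event(65535, 2, 65533) == 1);
	printf("eventidx checks passed\n");
	return 0;
}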
*/ + if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { + req_from_current_proc = false; + nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); + } else { + nvme_complete_request(req, cpl); + } + } + + if (req_from_current_proc == true) { + nvme_free_request(req); + } + + tr->req = NULL; + + TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + + /* + * If the controller is in the middle of resetting, don't + * try to submit queued requests here - let the reset logic + * handle that instead. + */ + if (!STAILQ_EMPTY(&qpair->queued_req) && + !qpair->ctrlr->is_resetting) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + nvme_qpair_submit_request(qpair, req); + } + } +} + +static void +nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, + struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.cid = tr->cid; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); +} + +static void +nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *temp; + + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { + SPDK_ERRLOG("aborting outstanding command\n"); + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + } +} + +static void +nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + while (tr != NULL) { + assert(tr->req != NULL); + if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, + SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, + false); + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + } else { + tr = TAILQ_NEXT(tr, tq_list); + } + } +} + +static void +nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_admin_qpair_abort_aers(qpair); +} + +static int +nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_pcie_admin_qpair_destroy(qpair); + } + if (pqpair->cmd && !pqpair->sq_in_cmb) { + spdk_free(pqpair->cmd); + } + if (pqpair->cpl) { + spdk_free(pqpair->cpl); + } + if (pqpair->tr) { + spdk_free(pqpair->tr); + } + + nvme_qpair_deinit(qpair); + + spdk_free(pqpair); + + return 0; +} + +static void +nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + /* + * Manually abort each outstanding admin command. Do not retry + * admin commands found here, since they will be left over from + * a controller reset and its likely the context in which the + * command was issued no longer applies. + */ + nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); +} + +static void +nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + /* Manually abort each outstanding I/O. 
*/ + nvme_pcie_qpair_abort_trackers(qpair, 0); +} + +int +nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + pqpair->is_enabled = true; + if (nvme_qpair_is_io_queue(qpair)) { + nvme_pcie_io_qpair_enable(qpair); + } else { + nvme_pcie_admin_qpair_enable(qpair); + } + + return 0; +} + +static void +nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_admin_qpair_abort_aers(qpair); +} + +static void +nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair) +{ +} + +int +nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + pqpair->is_enabled = false; + if (nvme_qpair_is_io_queue(qpair)) { + nvme_pcie_io_qpair_disable(qpair); + } else { + nvme_pcie_admin_qpair_disable(qpair); + } + + return 0; +} + + +int +nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */); + + return 0; +} + +static int +nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; + + /* + * TODO: create a create io completion queue command data + * structure. + */ + cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; + /* + * 0x2 = interrupts enabled + * 0x1 = physically contiguous + */ + cmd->cdw11 = 0x1; + cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; + + /* + * TODO: create a create io submission queue command data + * structure. 
+ */ + cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id; + /* 0x1 = physically contiguous */ + cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1; + cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; + cmd->cdw10 = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; + cmd->cdw10 = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t qid) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_completion_poll_status status; + int rc; + + rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); + if (rc != 0) { + return rc; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + SPDK_ERRLOG("nvme_create_io_cq failed!\n"); + return -1; + } + + rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); + if (rc != 0) { + return rc; + } + + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + SPDK_ERRLOG("nvme_create_io_sq failed!\n"); + /* Attempt to delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status); + if (rc != 0) { + return -1; + } + spdk_nvme_wait_for_completion(ctrlr->adminq, &status); + return -1; + } + + if (ctrlr->shadow_doorbell) { + pqpair->sq_shadow_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_shadow_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + pqpair->sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + } + nvme_pcie_qpair_reset(qpair); + + return 0; +} + +struct spdk_nvme_qpair * +nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct nvme_pcie_qpair *pqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + assert(ctrlr != NULL); + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return NULL; + } + + pqpair->num_entries = opts->io_queue_size; + + qpair = &pqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + rc = nvme_pcie_qpair_construct(qpair); + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid); + + if (rc != 
0) { + SPDK_ERRLOG("I/O queue creation failed\n"); + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + return qpair; +} + +int +nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); +} + +int +nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_completion_poll_status status; + int rc; + + assert(ctrlr != NULL); + + if (ctrlr->is_removed) { + goto free; + } + + /* Delete the I/O submission queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); + if (rc != 0) { + return rc; + } + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + return -1; + } + + if (qpair->no_deletion_notification_needed == 0) { + /* Complete any I/O in the completion queue */ + nvme_pcie_qpair_process_completions(qpair, 0); + + /* Abort the rest of the I/O */ + nvme_pcie_qpair_abort_trackers(qpair, 1); + } + + /* Delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); + if (rc != 0) { + return rc; + } + if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) { + return -1; + } + +free: + nvme_pcie_qpair_destroy(qpair); + return 0; +} + +static void +nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + /* + * Bad vtophys translation, so abort this request and return + * immediately. + */ + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INVALID_FIELD, + 1 /* do not retry */, true); +} + +/* + * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. + * + * *prp_index will be updated to account for the number of PRP entries used. + */ +static int +nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, + uint32_t page_size) +{ + struct spdk_nvme_cmd *cmd = &tr->req->cmd; + uintptr_t page_mask = page_size - 1; + uint64_t phys_addr; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", + *prp_index, virt_addr, (uint32_t)len); + + if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + return -EINVAL; + } + + i = *prp_index; + while (len) { + uint32_t seg_len; + + /* + * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, + * so prp_index == count is valid. 
+ */ + if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { + SPDK_ERRLOG("out of PRP entries\n"); + return -EINVAL; + } + + phys_addr = spdk_vtophys(virt_addr); + if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { + SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); + return -EINVAL; + } + + if (i == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); + cmd->dptr.prp.prp1 = phys_addr; + seg_len = page_size - ((uintptr_t)virt_addr & page_mask); + } else { + if ((phys_addr & page_mask) != 0) { + SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); + return -EINVAL; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); + tr->u.prp[i - 1] = phys_addr; + seg_len = page_size; + } + + seg_len = spdk_min(seg_len, len); + virt_addr += seg_len; + len -= seg_len; + i++; + } + + cmd->psdt = SPDK_NVME_PSDT_PRP; + if (i <= 1) { + cmd->dptr.prp.prp2 = 0; + } else if (i == 2) { + cmd->dptr.prp.prp2 = tr->u.prp[0]; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); + } else { + cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); + } + + *prp_index = i; + return 0; +} + +/** + * Build PRP list describing physically contiguous payload buffer. + */ +static int +nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr) +{ + uint32_t prp_index = 0; + int rc; + + rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, + req->payload_size, qpair->ctrlr->page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return rc; + } + + return 0; +} + +#define _2MB_OFFSET(ptr) (((uintptr_t)(ptr)) & (0x200000 - 1)) + +/** + * Build SGL list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr) +{ + int rc; + void *virt_addr; + uint64_t phys_addr; + uint32_t remaining_transfer_len, remaining_user_sge_len, length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + /* + * Build scattered payloads. 
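To make the PRP bookkeeping above concrete: PRP1 receives the address of the first, possibly unaligned, piece of the buffer; every later piece must be page-sized and page-aligned; if exactly one more entry is needed its address goes straight into PRP2, otherwise PRP2 points at the PRP list held in the tracker. A rough standalone sketch of the entry-count arithmetic (prp_entries_needed is invented for this note and is not part of the driver):

#include <stdint.h>
#include <stdio.h>

/* Number of page-sized pieces a transfer spans, given the offset of its first
 * byte within a page, i.e. how many PRP entries (PRP1 plus list entries)
 * describe it. */
static uint32_t prp_entries_needed(uint64_t first_page_offset, uint64_t len,
				   uint32_t page_size)
{
	uint64_t first_seg = page_size - first_page_offset;

	if (len <= first_seg) {
		return 1;	/* PRP1 alone is enough */
	}
	len -= first_seg;
	return 1 + (len + page_size - 1) / page_size;
}

int main(void)
{
	/* 4 KiB pages: an aligned 8 KiB transfer needs PRP1 plus one more
	 * entry, so the second page address fits directly in PRP2. */
	printf("%u\n", (unsigned)prp_entries_needed(0, 8192, 4096));	/* 2 */
	/* A 16 KiB transfer starting 512 bytes into a page spans 5 pages,
	 * so PRP2 must point at a PRP list. */
	printf("%u\n", (unsigned)prp_entries_needed(512, 16384, 4096));	/* 5 */
	return 0;
}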
+ */ + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + remaining_transfer_len = req->payload_size; + + while (remaining_transfer_len > 0) { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, + &virt_addr, &remaining_user_sge_len); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -1; + } + + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + while (remaining_user_sge_len > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -1; + } + + phys_addr = spdk_vtophys(virt_addr); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -1; + } + + length = spdk_min(remaining_user_sge_len, 0x200000 - _2MB_OFFSET(virt_addr)); + remaining_user_sge_len -= length; + virt_addr += length; + + if (nseg > 0 && phys_addr == + (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { + /* extend previous entry */ + (*(sgl - 1)).unkeyed.length += length; + continue; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* For now we can only support 1 SGL segment in NVMe controller */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; +} + +/** + * Build PRP list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr) +{ + int rc; + void *virt_addr; + uint32_t remaining_transfer_len, length; + uint32_t prp_index = 0; + uint32_t page_size = qpair->ctrlr->page_size; + + /* + * Build scattered payloads. + */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + remaining_transfer_len = req->payload_size; + while (remaining_transfer_len > 0) { + assert(req->payload.next_sge_fn != NULL); + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -1; + } + + length = spdk_min(remaining_transfer_len, length); + + /* + * Any incompatible sges should have been handled up in the splitting routine, + * but assert here as an additional check. + * + * All SGEs except last must end on a page boundary. 
+ */ + assert((length == remaining_transfer_len) || + _is_page_aligned((uintptr_t)virt_addr + length, page_size)); + + rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return rc; + } + + remaining_transfer_len -= length; + } + + return 0; +} + +static inline bool +nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + if (!pqpair->is_enabled && + !qpair->ctrlr->is_resetting) { + nvme_qpair_enable(qpair); + } + return pqpair->is_enabled; +} + +int +nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + struct nvme_tracker *tr; + int rc = 0; + void *md_payload; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + nvme_pcie_qpair_check_enabled(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + tr = TAILQ_FIRST(&pqpair->free_tr); + + if (tr == NULL || !pqpair->is_enabled) { + /* + * No tracker is available, or the qpair is disabled due to + * an in-progress controller-level reset. + * + * Put the request on the qpair's request queue to be + * processed when a tracker frees up via a command + * completion or when the controller reset is + * completed. + */ + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + goto exit; + } + + TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ + TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list); + tr->req = req; + req->cmd.cid = tr->cid; + + if (req->payload_size && req->payload.md) { + md_payload = req->payload.md + req->md_offset; + tr->req->cmd.mptr = spdk_vtophys(md_payload); + if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + rc = -EINVAL; + goto exit; + } + } + + if (req->payload_size == 0) { + /* Null payload - leave PRP fields zeroed */ + rc = 0; + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { + rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { + if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { + rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr); + } else { + rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr); + } + } else { + assert(0); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + rc = -EINVAL; + } + + if (rc < 0) { + goto exit; + } + + nvme_pcie_qpair_submit_tracker(qpair, tr); + +exit: + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return rc; +} + +static void +nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tracker *tr, *tmp; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. 
*/ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +int32_t +nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + struct nvme_tracker *tr; + struct spdk_nvme_cpl *cpl; + uint32_t num_completions = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + if (spdk_unlikely(!nvme_pcie_qpair_check_enabled(qpair))) { + /* + * qpair is not enabled, likely because a controller reset is + * is in progress. Ignore the interrupt - any I/O that was + * associated with this interrupt will get retried when the + * reset is complete. + */ + return 0; + } + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + if (max_completions == 0 || max_completions > pqpair->max_completions_cap) { + /* + * max_completions == 0 means unlimited, but complete at most + * max_completions_cap batch of I/O at a time so that the completion + * queue doorbells don't wrap around. + */ + max_completions = pqpair->max_completions_cap; + } + + while (1) { + cpl = &pqpair->cpl[pqpair->cq_head]; + + if (cpl->status.p != pqpair->phase) { + break; + } +#ifdef __PPC64__ + /* + * This memory barrier prevents reordering of: + * - load after store from/to tr + * - load after load cpl phase and cpl cid + */ + spdk_mb(); +#endif + + if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) { + pqpair->cq_head = 0; + pqpair->phase = !pqpair->phase; + } + + tr = &pqpair->tr[cpl->cid]; + pqpair->sq_head = cpl->sqhd; + + if (tr->active) { + nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true); + } else { + SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); + nvme_qpair_print_completion(qpair, cpl); + assert(0); + } + + if (++num_completions == max_completions) { + break; + } + } + + if (num_completions > 0) { + if (spdk_likely(nvme_pcie_qpair_update_mmio_required(qpair, pqpair->cq_head, + pqpair->cq_shadow_hdbl, + pqpair->cq_eventidx))) { + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); + g_thread_mmio_ctrlr = NULL; + } + } + + if (spdk_unlikely(ctrlr->timeout_enabled)) { + /* + * User registered for timeout callback + */ + nvme_pcie_qpair_check_timeout(qpair); + } + + /* Before returning, complete any pending admin request. */ + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_pcie_qpair_complete_pending_admin_request(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return num_completions; +} diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c new file mode 100644 index 00000000..9f585798 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_qpair.c @@ -0,0 +1,663 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" +#include "spdk/nvme_ocssd.h" + +static void nvme_qpair_fail(struct spdk_nvme_qpair *qpair); + +struct nvme_string { + uint16_t value; + const char *str; +}; + +static const struct nvme_string admin_opcode[] = { + { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, + { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, + { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, + { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, + { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, + { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" }, + { SPDK_NVME_OPC_ABORT, "ABORT" }, + { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" }, + { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" }, + { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, + { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" }, + { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" }, + { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, + { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, + { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" }, + { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, + { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, + { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, + { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, + { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, + { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, + { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, + { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, + { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, + { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, + { SPDK_NVME_OPC_SANITIZE, "SANITIZE" }, + { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" }, + { 0xFFFF, "ADMIN COMMAND" } +}; + +static const struct nvme_string io_opcode[] = { + { SPDK_NVME_OPC_FLUSH, "FLUSH" }, + { SPDK_NVME_OPC_WRITE, "WRITE" }, + { SPDK_NVME_OPC_READ, "READ" }, + { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, + { SPDK_NVME_OPC_COMPARE, "COMPARE" }, + { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, + { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, + { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, + { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, + { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, + { 
SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, + { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" }, + { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" }, + { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" }, + { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" }, + { 0xFFFF, "IO COMMAND" } +}; + +static const char * +nvme_get_string(const struct nvme_string *strings, uint16_t value) +{ + const struct nvme_string *entry; + + entry = strings; + + while (entry->value != 0xFFFF) { + if (entry->value == value) { + return entry->str; + } + entry++; + } + return entry->str; +} + +static void +nvme_admin_qpair_print_command(struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd) +{ + + SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%x " + "cdw10:%08x cdw11:%08x\n", + nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qpair->id, cmd->cid, + cmd->nsid, cmd->cdw10, cmd->cdw11); +} + +static void +nvme_io_qpair_print_command(struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_COMPARE: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d " + "lba:%llu len:%d\n", + nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid, + cmd->nsid, + ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, + (cmd->cdw12 & 0xFFFF) + 1); + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n", + nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid, + cmd->nsid); + break; + default: + SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n", + nvme_get_string(io_opcode, cmd->opc), cmd->opc, qpair->id, + cmd->cid, cmd->nsid); + break; + } +} + +void +nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_admin_qpair_print_command(qpair, cmd); + } else { + nvme_io_qpair_print_command(qpair, cmd); + } +} + +static const struct nvme_string generic_status[] = { + { SPDK_NVME_SC_SUCCESS, "SUCCESS" }, + { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, + { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" }, + { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, + { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, + { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, + { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, + { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, + { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, + { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, + { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, + { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, + { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" }, + { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" }, + { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, + { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" }, + { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" }, + { 
SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, + { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, + { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" }, + { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" }, + { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" }, + { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" }, + { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" }, + { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, + { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, + { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" }, + { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, + { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, + { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, + { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, + { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, + { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, + { 0xFFFF, "GENERIC" } +}; + +static const struct nvme_string command_specific_status[] = { + { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, + { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, + { SPDK_NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, + { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, + { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, + { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, + { SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, + { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, + { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, + { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, + { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" }, + { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, + { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, + { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" }, + { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, + { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, + { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, + { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" }, + { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, + { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, + { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" }, + { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" }, + { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" }, + { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, + { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" }, + { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" }, + { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, + { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" }, + { 
SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, + { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, + { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, + { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, + { 0xFFFF, "COMMAND SPECIFIC" } +}; + +static const struct nvme_string media_error_status[] = { + { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, + { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, + { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, + { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, + { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, + { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, + { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, + { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" }, + { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" }, + { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" }, + { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" }, + { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" }, + { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" }, + { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" }, + { 0xFFFF, "MEDIA ERROR" } +}; + +static const struct nvme_string path_status[] = { + { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, + { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" }, + { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" }, + { 0xFFFF, "PATH ERROR" } +}; + +static const char * +get_status_string(uint16_t sct, uint16_t sc) +{ + const struct nvme_string *entry; + + switch (sct) { + case SPDK_NVME_SCT_GENERIC: + entry = generic_status; + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + entry = command_specific_status; + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + entry = media_error_status; + break; + case SPDK_NVME_SCT_PATH: + entry = path_status; + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + return "VENDOR SPECIFIC"; + default: + return "RESERVED"; + } + + return nvme_get_string(entry, sc); +} + +void +nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cpl *cpl) +{ + SPDK_NOTICELOG("%s (%02x/%02x) sqid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", + get_status_string(cpl->status.sct, cpl->status.sc), + cpl->status.sct, cpl->status.sc, cpl->sqid, cpl->cid, cpl->cdw0, + cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr); +} + +bool +nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl) +{ + /* + * TODO: spec is not clear how commands that are aborted due + * to TLER will be marked. So for now, it seems + * NAMESPACE_NOT_READY is the only case where we should + * look at the DNR bit. 
+ */ + switch ((int)cpl->status.sct) { + case SPDK_NVME_SCT_GENERIC: + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + if (cpl->status.dnr) { + return false; + } else { + return true; + } + case SPDK_NVME_SC_INVALID_OPCODE: + case SPDK_NVME_SC_INVALID_FIELD: + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + default: + return false; + } + case SPDK_NVME_SCT_PATH: + /* + * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be + * based on the setting of the DNR bit for Internal Path Error + */ + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_INTERNAL_PATH_ERROR: + return !cpl->status.dnr; + default: + return false; + } + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + case SPDK_NVME_SCT_MEDIA_ERROR: + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + return false; + } +} + +static void +nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, uint32_t sct, uint32_t sc, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + bool error; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.status.sct = sct; + cpl.status.sc = sc; + + error = spdk_nvme_cpl_is_error(&cpl); + + if (error && print_on_error) { + SPDK_NOTICELOG("Command completed manually:\n"); + nvme_qpair_print_command(qpair, &req->cmd); + nvme_qpair_print_completion(qpair, &cpl); + } + + nvme_complete_request(req, &cpl); + nvme_free_request(req); +} + +int32_t +spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + int32_t ret; + struct nvme_request *req, *tmp; + + if (qpair->ctrlr->is_failed) { + nvme_qpair_fail(qpair); + return 0; + } + + /* error injection for those queued error requests */ + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) { + STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) { + if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) { + STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, true); + } + } + } + + qpair->in_completion_context = 1; + ret = nvme_transport_qpair_process_completions(qpair, max_completions); + qpair->in_completion_context = 0; + if (qpair->delete_after_completion_context) { + /* + * A request to delete this qpair was made in the context of this completion + * routine - so it is safe to delete it now. 
+ */ + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + return ret; +} + +int +nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + size_t req_size_padded; + uint32_t i; + + qpair->id = id; + qpair->qprio = qprio; + + qpair->in_completion_context = 0; + qpair->delete_after_completion_context = 0; + qpair->no_deletion_notification_needed = 0; + + qpair->ctrlr = ctrlr; + qpair->trtype = ctrlr->trid.trtype; + + STAILQ_INIT(&qpair->free_req); + STAILQ_INIT(&qpair->queued_req); + TAILQ_INIT(&qpair->err_cmd_head); + STAILQ_INIT(&qpair->err_req_head); + + req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63; + + qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (qpair->req_buf == NULL) { + SPDK_ERRLOG("no memory to allocate qpair(cntlid:0x%x sqid:%d) req_buf with %d request\n", + ctrlr->cntlid, qpair->id, num_requests); + return -ENOMEM; + } + + for (i = 0; i < num_requests; i++) { + struct nvme_request *req = qpair->req_buf + i * req_size_padded; + + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); + } + + return 0; +} + +void +nvme_qpair_deinit(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + struct nvme_error_cmd *cmd, *entry; + + while (!STAILQ_EMPTY(&qpair->err_req_head)) { + req = STAILQ_FIRST(&qpair->err_req_head); + STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, true); + } + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_dma_free(cmd); + } + + spdk_dma_free(qpair->req_buf); +} + +int +nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc = 0; + struct nvme_request *child_req, *tmp; + struct nvme_error_cmd *cmd; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + bool child_req_failed = false; + + if (ctrlr->is_failed) { + nvme_free_request(req); + return -ENXIO; + } + + if (req->num_children) { + /* + * This is a split (parent) request. Submit all of the children but not the parent + * request itself, since the parent is the original unsplit request. + */ + TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) { + if (!child_req_failed) { + rc = nvme_qpair_submit_request(qpair, child_req); + if (rc != 0) { + child_req_failed = true; + } + } else { /* free remaining child_reqs since one child_req fails */ + nvme_request_remove_child(req, child_req); + nvme_free_request(child_req); + } + } + + return rc; + } + + /* queue those requests which matches with opcode in err_cmd list */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + if (!cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + /* add to error request list and set cpl */ + req->timeout_tsc = cmd->timeout_tsc; + req->submit_tick = spdk_get_ticks(); + req->cpl.status.sct = cmd->status.sct; + req->cpl.status.sc = cmd->status.sc; + STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq); + cmd->err_count--; + return 0; + } + } + } + + return nvme_transport_qpair_submit_request(qpair, req); +} + +static void +_nvme_io_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + /* Manually abort each queued I/O. 
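The req_size_padded computation in nvme_qpair_init above uses the usual round-up-to-a-power-of-two mask so each request lands on its own 64-byte cache line. A tiny standalone illustration of that idiom (round_up_pow2 is a name made up for the sketch):

#include <stddef.h>
#include <stdio.h>

/* Round x up to the next multiple of a power-of-two alignment, the same
 * (x + align - 1) & ~(align - 1) pattern used for req_size_padded. */
static size_t round_up_pow2(size_t x, size_t align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	printf("%zu %zu %zu\n",
	       round_up_pow2(1, 64),	/* 64  */
	       round_up_pow2(64, 64),	/* 64  */
	       round_up_pow2(200, 64));	/* 256 */
	return 0;
}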
*/ + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + SPDK_ERRLOG("aborting queued i/o\n"); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, true); + } +} + +void +nvme_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + if (nvme_qpair_is_io_queue(qpair)) { + _nvme_io_qpair_enable(qpair); + } + + nvme_transport_qpair_enable(qpair); +} + +void +nvme_qpair_disable(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->err_req_head)) { + req = STAILQ_FIRST(&qpair->err_req_head); + STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, true); + } + + nvme_transport_qpair_disable(qpair); +} + +static void +nvme_qpair_fail(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + SPDK_ERRLOG("failing queued i/o\n"); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, true); + } + + nvme_transport_qpair_fail(qpair); +} + +int +spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc, bool do_not_submit, + uint64_t timeout_in_us, + uint32_t err_count, + uint8_t sct, uint8_t sc) +{ + struct nvme_error_cmd *entry, *cmd = NULL; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) { + if (entry->opc == opc) { + cmd = entry; + break; + } + } + + if (cmd == NULL) { + cmd = spdk_dma_zmalloc(sizeof(*cmd), 64, NULL); + if (!cmd) { + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link); + } + + cmd->do_not_submit = do_not_submit; + cmd->err_count = err_count; + cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL; + cmd->opc = opc; + cmd->status.sct = sct; + cmd->status.sc = sc; + + return 0; +} + +void +spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc) +{ + struct nvme_error_cmd *cmd, *entry; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + if (cmd->opc == opc) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_dma_free(cmd); + return; + } + } + + return; +} diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c new file mode 100644 index 00000000..9a213b12 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_quirks.c @@ -0,0 +1,141 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +struct nvme_quirk { + struct spdk_pci_id id; + uint64_t flags; +}; + +static const struct nvme_quirk nvme_quirks[] = { + { {SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE + }, + { {SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE + }, + { {SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE + }, + { {SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE + }, + { {SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC + }, + { {SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_INTEL_QUIRK_NO_LOG_PAGES + }, + { {SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_OCSSD + }, + { {0x0000, 0x0000, 0x0000, 0x0000}, 0} +}; + +/* Compare each field. 
SPDK_PCI_ANY_ID in s1 matches everything */ +static bool +pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2) +{ + if ((s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && + (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) && + (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) && + (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) { + return true; + } + return false; +} + +uint64_t +nvme_get_quirks(const struct spdk_pci_id *id) +{ + const struct nvme_quirk *quirk = nvme_quirks; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n", + id->vendor_id, id->device_id, + id->subvendor_id, id->subdevice_id); + + while (quirk->id.vendor_id) { + if (pci_id_match(&quirk->id, id)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n", + quirk->id.vendor_id, quirk->id.device_id, + quirk->id.subvendor_id, quirk->id.subdevice_id); + +#define PRINT_QUIRK(quirk_flag) \ + do { \ + if (quirk->flags & (quirk_flag)) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \ + } \ + } while (0) + + PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY); + PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY); + PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY); + PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING); + PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC); + PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE); + PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS); + PRINT_QUIRK(NVME_QUIRK_OCSSD); + + return quirk->flags; + } + quirk++; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n"); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c new file mode 100644 index 00000000..b356e3a1 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_rdma.c @@ -0,0 +1,1634 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe over RDMA transport + */ + +#include "spdk/stdinc.h" + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +#include "spdk/assert.h" +#include "spdk/log.h" +#include "spdk/trace.h" +#include "spdk/event.h" +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/endian.h" +#include "spdk/likely.h" + +#include "nvme_internal.h" + +#define NVME_RDMA_TIME_OUT_IN_MS 2000 +#define NVME_RDMA_RW_BUFFER_SIZE 131072 + +/* + * NVME RDMA qpair Resource Defaults + */ +#define NVME_RDMA_DEFAULT_TX_SGE 2 +#define NVME_RDMA_DEFAULT_RX_SGE 1 + + +/* Max number of NVMe-oF SGL descriptors supported by the host */ +#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16 +struct spdk_nvmf_cmd { + struct spdk_nvme_cmd cmd; + struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS]; +}; + +/* Mapping from virtual address to ibv_mr pointer for a protection domain */ +struct spdk_nvme_rdma_mr_map { + struct ibv_pd *pd; + struct spdk_mem_map *map; + uint64_t ref; + LIST_ENTRY(spdk_nvme_rdma_mr_map) link; +}; + +/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */ +struct nvme_rdma_ctrlr { + struct spdk_nvme_ctrlr ctrlr; +}; + +/* NVMe RDMA qpair extensions for spdk_nvme_qpair */ +struct nvme_rdma_qpair { + struct spdk_nvme_qpair qpair; + + struct rdma_event_channel *cm_channel; + + struct rdma_cm_id *cm_id; + + struct ibv_cq *cq; + + struct spdk_nvme_rdma_req *rdma_reqs; + + uint16_t num_entries; + + /* Parallel arrays of response buffers + response SGLs of size num_entries */ + struct ibv_sge *rsp_sgls; + struct spdk_nvme_cpl *rsps; + + struct ibv_recv_wr *rsp_recv_wrs; + + /* Memory region describing all rsps for this qpair */ + struct ibv_mr *rsp_mr; + + /* + * Array of num_entries NVMe commands registered as RDMA message buffers. + * Indexed by rdma_req->id.
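Because the command array, the request array, and the CID placed in each submitted command all share this index, a completion maps back to its originating request with plain array lookups; roughly (mirroring what nvme_rdma_recv() does further down):

	/* Sketch: recover the request and command slot from a received completion. */
	struct spdk_nvme_cpl *rsp = &rqpair->rsps[rsp_idx];
	struct spdk_nvme_rdma_req *rdma_req = &rqpair->rdma_reqs[rsp->cid]; /* cid == rdma_req->id */
	struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id];            /* same index again   */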
+ */ + struct spdk_nvmf_cmd *cmds; + + /* Memory region describing all cmds for this qpair */ + struct ibv_mr *cmd_mr; + + struct spdk_nvme_rdma_mr_map *mr_map; + + TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs; + TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs; +}; + +struct spdk_nvme_rdma_req { + int id; + + struct ibv_send_wr send_wr; + + struct nvme_request *req; + + struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE]; + + TAILQ_ENTRY(spdk_nvme_rdma_req) link; +}; + +static const char *rdma_cm_event_str[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; + +static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps); +static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER; + +static int nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair); + +static inline struct nvme_rdma_qpair * +nvme_rdma_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair); +} + +static inline struct nvme_rdma_ctrlr * +nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr); +} + +static struct spdk_nvme_rdma_req * +nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_req *rdma_req; + + rdma_req = TAILQ_FIRST(&rqpair->free_reqs); + if (rdma_req) { + TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link); + TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link); + } + + return rdma_req; +} + +static void +nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link); +} + +static void +nvme_rdma_req_complete(struct nvme_request *req, + struct spdk_nvme_cpl *rsp) +{ + nvme_complete_request(req, rsp); + nvme_free_request(req); +} + +static const char * +nvme_rdma_cm_event_str_get(uint32_t event) +{ + if (event < SPDK_COUNTOF(rdma_cm_event_str)) { + return rdma_cm_event_str[event]; + } else { + return "Undefined"; + } +} + +static struct rdma_cm_event * +nvme_rdma_get_event(struct rdma_event_channel *channel, + enum rdma_cm_event_type evt) +{ + struct rdma_cm_event *event; + int rc; + + rc = rdma_get_cm_event(channel, &event); + if (rc < 0) { + SPDK_ERRLOG("Failed to get event from CM event channel. 
Error %d (%s)\n", + errno, spdk_strerror(errno)); + return NULL; + } + + if (event->event != evt) { + SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n", + nvme_rdma_cm_event_str_get(evt), + nvme_rdma_cm_event_str_get(event->event), event->event, event->status); + rdma_ack_cm_event(event); + return NULL; + } + + return event; +} + +static int +nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair) +{ + int rc; + struct ibv_qp_init_attr attr; + + rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0); + if (!rqpair->cq) { + SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); + attr.qp_type = IBV_QPT_RC; + attr.send_cq = rqpair->cq; + attr.recv_cq = rqpair->cq; + attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */ + attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */ + attr.cap.max_send_sge = NVME_RDMA_DEFAULT_TX_SGE; + attr.cap.max_recv_sge = NVME_RDMA_DEFAULT_RX_SGE; + + rc = rdma_create_qp(rqpair->cm_id, NULL, &attr); + if (rc) { + SPDK_ERRLOG("rdma_create_qp failed\n"); + return -1; + } + + rqpair->cm_id->context = &rqpair->qpair; + + return 0; +} + +#define nvme_rdma_trace_ibv_sge(sg_list) \ + if (sg_list) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \ + (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \ + } + +static int +nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx) +{ + struct ibv_recv_wr *wr, *bad_wr = NULL; + int rc; + + wr = &rqpair->rsp_recv_wrs[rsp_idx]; + nvme_rdma_trace_ibv_sge(wr->sg_list); + + rc = ibv_post_recv(rqpair->cm_id->qp, wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc); + } + + return rc; +} + +static void +nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair) +{ + if (rqpair->rsp_mr && rdma_dereg_mr(rqpair->rsp_mr)) { + SPDK_ERRLOG("Unable to de-register rsp_mr\n"); + } + rqpair->rsp_mr = NULL; + + free(rqpair->rsps); + rqpair->rsps = NULL; + free(rqpair->rsp_sgls); + rqpair->rsp_sgls = NULL; + free(rqpair->rsp_recv_wrs); + rqpair->rsp_recv_wrs = NULL; +} + +static int +nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + + rqpair->rsp_mr = NULL; + rqpair->rsps = NULL; + rqpair->rsp_recv_wrs = NULL; + + rqpair->rsp_sgls = calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls)); + if (!rqpair->rsp_sgls) { + SPDK_ERRLOG("Failed to allocate rsp_sgls\n"); + goto fail; + } + + rqpair->rsp_recv_wrs = calloc(rqpair->num_entries, + sizeof(*rqpair->rsp_recv_wrs)); + if (!rqpair->rsp_recv_wrs) { + SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n"); + goto fail; + } + + rqpair->rsps = calloc(rqpair->num_entries, sizeof(*rqpair->rsps)); + if (!rqpair->rsps) { + SPDK_ERRLOG("can not allocate rdma rsps\n"); + goto fail; + } + + rqpair->rsp_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->rsps, + rqpair->num_entries * sizeof(*rqpair->rsps)); + if (rqpair->rsp_mr == NULL) { + SPDK_ERRLOG("Unable to register rsp_mr\n"); + goto fail; + } + + for (i = 0; i < rqpair->num_entries; i++) { + struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i]; + + rsp_sgl->addr = (uint64_t)&rqpair->rsps[i]; + rsp_sgl->length = sizeof(rqpair->rsps[i]); + rsp_sgl->lkey = rqpair->rsp_mr->lkey; + + rqpair->rsp_recv_wrs[i].wr_id = i; + rqpair->rsp_recv_wrs[i].next = NULL; + rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl; + rqpair->rsp_recv_wrs[i].num_sge = 1; + + if (nvme_rdma_post_recv(rqpair, i)) { 
+ SPDK_ERRLOG("Unable to post connection rx desc\n"); + goto fail; + } + } + + return 0; + +fail: + nvme_rdma_free_rsps(rqpair); + return -ENOMEM; +} + +static void +nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair) +{ + if (!rqpair->rdma_reqs) { + return; + } + + if (rqpair->cmd_mr && rdma_dereg_mr(rqpair->cmd_mr)) { + SPDK_ERRLOG("Unable to de-register cmd_mr\n"); + } + rqpair->cmd_mr = NULL; + + free(rqpair->cmds); + rqpair->cmds = NULL; + + free(rqpair->rdma_reqs); + rqpair->rdma_reqs = NULL; +} + +static int +nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair) +{ + int i; + + rqpair->rdma_reqs = calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req)); + if (rqpair->rdma_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate rdma_reqs\n"); + goto fail; + } + + rqpair->cmds = calloc(rqpair->num_entries, sizeof(*rqpair->cmds)); + if (!rqpair->cmds) { + SPDK_ERRLOG("Failed to allocate RDMA cmds\n"); + goto fail; + } + + rqpair->cmd_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->cmds, + rqpair->num_entries * sizeof(*rqpair->cmds)); + if (!rqpair->cmd_mr) { + SPDK_ERRLOG("Unable to register cmd_mr\n"); + goto fail; + } + + TAILQ_INIT(&rqpair->free_reqs); + TAILQ_INIT(&rqpair->outstanding_reqs); + for (i = 0; i < rqpair->num_entries; i++) { + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvmf_cmd *cmd; + + rdma_req = &rqpair->rdma_reqs[i]; + cmd = &rqpair->cmds[i]; + + rdma_req->id = i; + + /* The first RDMA sgl element will always point + * at this data structure. Depending on whether + * an NVMe-oF SGL is required, the length of + * this element may change. */ + rdma_req->send_sgl[0].addr = (uint64_t)cmd; + rdma_req->send_sgl[0].lkey = rqpair->cmd_mr->lkey; + + rdma_req->send_wr.wr_id = (uint64_t)rdma_req; + rdma_req->send_wr.next = NULL; + rdma_req->send_wr.opcode = IBV_WR_SEND; + rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->send_wr.sg_list = rdma_req->send_sgl; + rdma_req->send_wr.imm_data = 0; + + TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link); + } + + return 0; + +fail: + nvme_rdma_free_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_recv(struct nvme_rdma_qpair *rqpair, uint64_t rsp_idx) +{ + struct spdk_nvme_qpair *qpair = &rqpair->qpair; + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvme_cpl *rsp; + struct nvme_request *req; + + assert(rsp_idx < rqpair->num_entries); + rsp = &rqpair->rsps[rsp_idx]; + rdma_req = &rqpair->rdma_reqs[rsp->cid]; + + req = rdma_req->req; + nvme_rdma_req_complete(req, rsp); + + nvme_rdma_req_put(rqpair, rdma_req); + if (nvme_rdma_post_recv(rqpair, rsp_idx)) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + return -1; + } + + if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + nvme_qpair_submit_request(qpair, req); + } + + return 0; +} + +static int +nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_event_channel *cm_channel) +{ + int ret; + struct rdma_cm_event *event; + + ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr, + NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno); + return ret; + } + + event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED); + if (event == NULL) { + SPDK_ERRLOG("RDMA address resolution error\n"); + return -1; + } + rdma_ack_cm_event(event); + + ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + 
SPDK_ERRLOG("rdma_resolve_route\n"); + return ret; + } + + event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED); + if (event == NULL) { + SPDK_ERRLOG("RDMA route resolution error\n"); + return -1; + } + rdma_ack_cm_event(event); + + return 0; +} + +static int +nvme_rdma_connect(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_conn_param param = {}; + struct spdk_nvmf_rdma_request_private_data request_data = {}; + struct spdk_nvmf_rdma_accept_private_data *accept_data; + struct ibv_device_attr attr; + int ret; + struct rdma_cm_event *event; + struct spdk_nvme_ctrlr *ctrlr; + + ret = ibv_query_device(rqpair->cm_id->verbs, &attr); + if (ret != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return ret; + } + + param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom); + + ctrlr = rqpair->qpair.ctrlr; + if (!ctrlr) { + return -1; + } + + request_data.qid = rqpair->qpair.id; + request_data.hrqsize = rqpair->num_entries; + request_data.hsqsize = rqpair->num_entries - 1; + request_data.cntlid = ctrlr->cntlid; + + param.private_data = &request_data; + param.private_data_len = sizeof(request_data); + param.retry_count = 7; + param.rnr_retry_count = 7; + + ret = rdma_connect(rqpair->cm_id, ¶m); + if (ret) { + SPDK_ERRLOG("nvme rdma connect error\n"); + return ret; + } + + event = nvme_rdma_get_event(rqpair->cm_channel, RDMA_CM_EVENT_ESTABLISHED); + if (event == NULL) { + SPDK_ERRLOG("RDMA connect error\n"); + return -1; + } + + accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data; + if (accept_data == NULL) { + rdma_ack_cm_event(event); + SPDK_ERRLOG("NVMe-oF target did not return accept data\n"); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. 
Actually got queue depth %d.\n", + rqpair->num_entries, accept_data->crqsize); + + rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize); + + rdma_ack_cm_event(event); + + return 0; +} + +static int +nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static int +nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -EFAULT; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + if (mr) { + ibv_dereg_mr(mr); + } + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_pd *pd = rqpair->cm_id->qp->pd; + struct spdk_nvme_rdma_mr_map *mr_map; + const struct spdk_mem_map_ops nvme_rdma_map_ops = { + .notify_cb = nvme_rdma_mr_map_notify, + .are_contiguous = NULL + }; + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + /* Look up existing mem map registration for this pd */ + LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) { + if (mr_map->pd == pd) { + mr_map->ref++; + rqpair->mr_map = mr_map; + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return 0; + } + } + + mr_map = calloc(1, sizeof(*mr_map)); + if (mr_map == NULL) { + SPDK_ERRLOG("calloc() failed\n"); + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + mr_map->ref = 1; + mr_map->pd = pd; + mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd); + if (mr_map->map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + free(mr_map); + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + rqpair->mr_map = mr_map; + LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + + return 0; +} + +static void +nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_mr_map *mr_map; + + mr_map = rqpair->mr_map; + rqpair->mr_map = NULL; + + if (mr_map == NULL) { + return; + } + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + assert(mr_map->ref > 0); + mr_map->ref--; + if (mr_map->ref == 0) { + LIST_REMOVE(mr_map, link); + spdk_mem_map_free(&mr_map->map); + free(mr_map); + } + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); +} + +static int +nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + bool src_addr_specified; 
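Every payload buffer handed to this transport must already lie inside a region registered with ibv_reg_mr(); the per-PD memory map turns the per-I/O cost into a translation lookup rather than a registration. Sketch of the pattern the request builders below rely on (payload, payload_len, sge and sgl are placeholder names):

	/* Sketch: translate a payload address to its cached ibv_mr to fill in lkey/rkey. */
	uint64_t size = payload_len;
	struct ibv_mr *mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map,
								     (uint64_t)payload, &size);
	if (mr == NULL || size < payload_len) {
		/* buffer not registered, or the registration does not cover the whole payload */
		return -EINVAL;
	}
	sge.lkey = mr->lkey;		/* local access, used for inline/send SGEs  */
	sgl.keyed.key = mr->rkey;	/* remote access, used for keyed data blocks */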
+ int rc; + struct spdk_nvme_ctrlr *ctrlr; + int family; + + rqpair->cm_channel = rdma_create_event_channel(); + if (rqpair->cm_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed\n"); + return -1; + } + + ctrlr = rqpair->qpair.ctrlr; + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + src_addr_specified = true; + } else { + src_addr_specified = false; + } + + rc = rdma_create_id(rqpair->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + return -1; + } + + rc = nvme_rdma_resolve_addr(rqpair, + src_addr_specified ? (struct sockaddr *)&src_addr : NULL, + (struct sockaddr *)&dst_addr, rqpair->cm_channel); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n"); + return -1; + } + + rc = nvme_rdma_qpair_init(rqpair); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n"); + return -1; + } + + rc = nvme_rdma_connect(rqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the rqpair\n"); + return -1; + } + + rc = nvme_rdma_alloc_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n"); + + rc = nvme_rdma_alloc_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n"); + + rc = nvme_rdma_register_mem(rqpair); + if (rc < 0) { + SPDK_ERRLOG("Unable to register memory for RDMA\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries); + if (rc < 0) { + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +/* + * Build SGL describing empty payload. + */ +static int +nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. 
*/ + rdma_req->send_wr.num_sge = 1; + + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = 0; + req->cmd.dptr.sgl1.keyed.key = 0; + req->cmd.dptr.sgl1.address = 0; + + return 0; +} + +/* + * Build inline SGL describing contiguous payload buffer. + */ +static int +nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct ibv_mr *mr; + void *payload; + uint64_t requested_size; + + payload = req->payload.contig_or_cb_arg + req->payload_offset; + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + requested_size = req->payload_size; + mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, + (uint64_t)payload, &requested_size); + + if (mr == NULL || requested_size < req->payload_size) { + return -EINVAL; + } + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + rdma_req->send_sgl[1].addr = (uint64_t)payload; + rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; + rdma_req->send_sgl[1].lkey = mr->lkey; + + /* The RDMA SGL contains two elements. The first describes + * the NVMe command and the second describes the data + * payload. */ + rdma_req->send_wr.num_sge = 2; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. */ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. + */ +static int +nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + void *payload = req->payload.contig_or_cb_arg + req->payload_offset; + struct ibv_mr *mr; + uint64_t requested_size; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + requested_size = req->payload_size; + mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload, + &requested_size); + if (mr == NULL || requested_size < req->payload_size) { + return -1; + } + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. */ + rdma_req->send_wr.num_sge = 1; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = req->payload_size; + req->cmd.dptr.sgl1.keyed.key = mr->rkey; + req->cmd.dptr.sgl1.address = (uint64_t)payload; + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. 
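The two contiguous-buffer builders above differ only in how dptr.sgl1 describes the single data block: in-capsule data is sent unkeyed, with the payload carried as a second local SGE, while the keyed form hands the target an rkey and address so it can move the data itself. Side by side, using the values set in those functions:

	/* In-capsule (inline) data: payload travels in the SEND, second SGE uses lkey. */
	cmd.dptr.sgl1.unkeyed.type    = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
	cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
	cmd.dptr.sgl1.unkeyed.length  = payload_size;
	cmd.dptr.sgl1.address         = 0;		/* offset 0; inline only when icdoff == 0 */

	/* Keyed data block: the target performs the RDMA transfer using rkey + address. */
	cmd.dptr.sgl1.keyed.type      = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
	cmd.dptr.sgl1.keyed.subtype   = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
	cmd.dptr.sgl1.keyed.length    = payload_size;
	cmd.dptr.sgl1.keyed.key       = mr->rkey;
	cmd.dptr.sgl1.address         = (uint64_t)payload;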
+ */ +static int +nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id]; + struct ibv_mr *mr = NULL; + void *virt_addr; + uint64_t remaining_size, mr_length; + uint32_t sge_length; + int rc, max_num_sgl, num_sgl_desc; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = req->qpair->ctrlr->max_sges; + + remaining_size = req->payload_size; + num_sgl_desc = 0; + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length); + if (rc) { + return -1; + } + + sge_length = spdk_min(remaining_size, sge_length); + mr_length = sge_length; + + mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr, + &mr_length); + + if (mr == NULL || mr_length < sge_length) { + return -1; + } + + cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + cmd->sgl[num_sgl_desc].keyed.length = sge_length; + cmd->sgl[num_sgl_desc].keyed.key = mr->rkey; + cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr; + + remaining_size -= sge_length; + num_sgl_desc++; + } while (remaining_size > 0 && num_sgl_desc < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ + if (remaining_size > 0) { + return -1; + } + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The RDMA SGL needs one element describing some portion + * of the spdk_nvmf_cmd structure. */ + rdma_req->send_wr.num_sge = 1; + + /* + * If only one SGL descriptor is required, it can be embedded directly in the command + * as a data block descriptor. + */ + if (num_sgl_desc == 1) { + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = req->payload_size; + req->cmd.dptr.sgl1.keyed.key = mr->rkey; + req->cmd.dptr.sgl1.address = rqpair->cmds[rdma_req->id].sgl[0].address; + } else { + /* + * Otherwise, The SGL descriptor embedded in the command must point to the list of + * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. + */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct + spdk_nvme_sgl_descriptor) * num_sgl_desc; + + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); + req->cmd.dptr.sgl1.address = (uint64_t)0; + } + + return 0; +} + +/* + * Build inline SGL describing sgl payload buffer. 
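When more than one descriptor is needed, the descriptors ride in the capsule directly behind the 64-byte command and send_sgl[0].length grows accordingly. For example, with three keyed descriptors (16 bytes each, per the NVMe SGL descriptor format):

	/* Example: a 3-element payload ships 3 keyed descriptors after the 64-byte command. */
	rdma_req->send_wr.num_sge    = 1;				/* one SGE covers cmd + descriptor list   */
	rdma_req->send_sgl[0].length = 64 + 3 * 16;			/* = 112 bytes sent in the command capsule */
	cmd.dptr.sgl1.unkeyed.type   = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
	cmd.dptr.sgl1.unkeyed.length = 3 * 16;				/* byte length of the descriptor list      */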
+ */ +static int +nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct ibv_mr *mr; + uint32_t length; + uint64_t requested_size; + void *virt_addr; + int rc; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + /* TODO: for now, we only support a single SGL entry */ + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + return -1; + } + + if (length < req->payload_size) { + SPDK_ERRLOG("multi-element SGL currently not supported for RDMA\n"); + return -1; + } + + requested_size = req->payload_size; + mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr, + &requested_size); + if (mr == NULL || requested_size < req->payload_size) { + return -1; + } + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + rdma_req->send_sgl[1].addr = (uint64_t)virt_addr; + rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; + rdma_req->send_sgl[1].lkey = mr->lkey; + + /* The RDMA SGL contains two elements. The first describes + * the NVMe command and the second describes the data + * payload. */ + rdma_req->send_wr.num_sge = 2; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. */ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +static inline unsigned int +nvme_rdma_icdsz_bytes(struct spdk_nvme_ctrlr *ctrlr) +{ + return (ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd)); +} + +static int +nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr; + int rc; + + rdma_req->req = req; + req->cmd.cid = rdma_req->id; + + if (req->payload_size == 0) { + rc = nvme_rdma_build_null_request(rdma_req); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { + /* + * Check if icdoff is non zero, to avoid interop conflicts with + * targets with non-zero icdoff. Both SPDK and the Linux kernel + * targets use icdoff = 0. For targets with non-zero icdoff, we + * will currently just not use inline data for now. 
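nvme_rdma_icdsz_bytes() above converts the controller's ioccsz (reported in 16-byte units) into usable in-capsule data bytes. For example, a target reporting ioccsz = 260 allows 260 * 16 - 64 = 4096 bytes, so a 4 KiB write qualifies for the inline path and anything larger falls back to a keyed SGL:

	/* Example: ioccsz = 260  ->  260 * 16 - sizeof(struct spdk_nvme_cmd) = 4160 - 64 = 4096 bytes. */
	assert(nvme_rdma_icdsz_bytes(ctrlr) == 4096);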
+ */ + if (req->cmd.opc == SPDK_NVME_OPC_WRITE && + req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) && + (ctrlr->cdata.nvmf_specific.icdoff == 0)) { + rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_contig_request(rqpair, rdma_req); + } + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { + if (req->cmd.opc == SPDK_NVME_OPC_WRITE && + req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) && + ctrlr->cdata.nvmf_specific.icdoff == 0) { + rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + } else { + rc = -1; + } + + if (rc) { + return rc; + } + + memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd)); + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + rqpair = calloc(1, sizeof(struct nvme_rdma_qpair)); + if (!rqpair) { + SPDK_ERRLOG("failed to get create rqpair\n"); + return NULL; + } + + rqpair->num_entries = qsize; + + qpair = &rqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + return NULL; + } + + rc = nvme_rdma_qpair_connect(rqpair); + if (rc < 0) { + nvme_rdma_qpair_destroy(qpair); + return NULL; + } + + return qpair; +} + +static int +nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair; + + if (!qpair) { + return -1; + } + nvme_qpair_deinit(qpair); + + rqpair = nvme_rdma_qpair(qpair); + + nvme_rdma_unregister_mem(rqpair); + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free_rsps(rqpair); + + if (rqpair->cm_id) { + if (rqpair->cm_id->qp) { + rdma_destroy_qp(rqpair->cm_id); + } + rdma_destroy_id(rqpair->cm_id); + } + + if (rqpair->cq) { + ibv_destroy_cq(rqpair->cq); + } + + if (rqpair->cm_channel) { + rdma_destroy_event_channel(rqpair->cm_channel); + } + + free(rqpair); + + return 0; +} + +struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests); +} + +int +nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + /* do nothing here */ + return 0; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +int +nvme_rdma_ctrlr_scan(const struct spdk_nvme_transport_id *discovery_trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_remove_cb remove_cb, + bool direct_connect) +{ + struct spdk_nvme_ctrlr_opts discovery_opts; + struct spdk_nvme_ctrlr *discovery_ctrlr; + union spdk_nvme_cc_register cc; + int rc; + struct nvme_completion_poll_status status; + + if (strcmp(discovery_trid->subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) { + /* It is not a discovery_ctrlr info and try to directly connect it */ + rc = nvme_ctrlr_probe(discovery_trid, NULL, probe_cb, cb_ctx); + return rc; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts)); + /* For discovery_ctrlr set the timeout to 0 */ + discovery_opts.keep_alive_timeout_ms = 0; + + discovery_ctrlr = nvme_rdma_ctrlr_construct(discovery_trid, &discovery_opts, NULL); + if (discovery_ctrlr == NULL) { + return -1; + } + + /* TODO: this should be using the normal NVMe controller initialization process */ + cc.raw = 0; + 
cc.bits.en = 1; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc.raw); + if (rc < 0) { + SPDK_ERRLOG("Failed to set cc\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -1; + } + + /* get the cdata info */ + rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata), + nvme_completion_poll_cb, &status); + if (rc != 0) { + SPDK_ERRLOG("Failed to identify cdata\n"); + return rc; + } + + if (spdk_nvme_wait_for_completion(discovery_ctrlr->adminq, &status)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + return -ENXIO; + } + + /* Direct attach through spdk_nvme_connect() API */ + if (direct_connect == true) { + /* Set the ready state to skip the normal init process */ + discovery_ctrlr->state = NVME_CTRLR_STATE_READY; + nvme_ctrlr_connected(discovery_ctrlr); + nvme_ctrlr_add_process(discovery_ctrlr, 0); + return 0; + } + + rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, cb_ctx, probe_cb); + nvme_ctrlr_destruct(discovery_ctrlr); + return rc; +} + +struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct nvme_rdma_ctrlr *rctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + int rc; + + rctrlr = calloc(1, sizeof(struct nvme_rdma_ctrlr)); + if (rctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + rctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_RDMA; + rctrlr->ctrlr.opts = *opts; + memcpy(&rctrlr->ctrlr.trid, trid, sizeof(rctrlr->ctrlr.trid)); + + rc = nvme_ctrlr_construct(&rctrlr->ctrlr); + if (rc != 0) { + free(rctrlr); + return NULL; + } + + rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0, + SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES, 0, SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES); + if (!rctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n"); + return &rctrlr->ctrlr; +} + +int +nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + + if (ctrlr->adminq) { + nvme_rdma_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + free(rctrlr); + + return 0; +} + +int +nvme_rdma_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + return nvme_fabric_ctrlr_set_reg_4(ctrlr, offset, value); +} + +int +nvme_rdma_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + return nvme_fabric_ctrlr_set_reg_8(ctrlr, offset, value); +} + +int +nvme_rdma_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + return nvme_fabric_ctrlr_get_reg_4(ctrlr, offset, value); 
+} + +int +nvme_rdma_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + return nvme_fabric_ctrlr_get_reg_8(ctrlr, offset, value); +} + +int +nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct ibv_send_wr *wr, *bad_wr = NULL; + int rc; + + rqpair = nvme_rdma_qpair(qpair); + assert(rqpair != NULL); + assert(req != NULL); + + rdma_req = nvme_rdma_req_get(rqpair); + if (!rdma_req) { + /* + * No rdma_req is available. Queue the request to be processed later. + */ + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + return 0; + } + + if (nvme_rdma_req_init(rqpair, req, rdma_req)) { + SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); + nvme_rdma_req_put(rqpair, rdma_req); + return -1; + } + + req->timed_out = false; + if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) { + req->submit_tick = spdk_get_ticks(); + } else { + req->submit_tick = 0; + } + + wr = &rdma_req->send_wr; + + nvme_rdma_trace_ibv_sge(wr->sg_list); + + rc = ibv_post_send(rqpair->cm_id->qp, wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Failure posting rdma send for NVMf completion: %d (%s)\n", rc, spdk_strerror(rc)); + } + + return rc; +} + +int +nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + return nvme_rdma_qpair_destroy(qpair); +} + +int +nvme_rdma_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + return nvme_rdma_qpair_connect(nvme_rdma_qpair(qpair)); +} + +int +nvme_rdma_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +int +nvme_rdma_qpair_disable(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +int +nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +int +nvme_rdma_qpair_fail(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +static void +nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. 
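nvme_rdma_qpair_check_timeout() only scans outstanding requests when the owning process has registered a timeout callback; an application opts in through the public helper (the signature below is assumed from spdk/nvme.h of this era, not defined in this file):

	/* Sketch (assumed public API shape): ask for a callback after 5 seconds of inactivity. */
	static void
	io_timeout_cb(void *cb_arg, struct spdk_nvme_ctrlr *ctrlr,
		      struct spdk_nvme_qpair *qpair, uint16_t cid)
	{
		/* e.g. log the stuck command identified by cid and reset the controller */
	}

	spdk_nvme_ctrlr_register_timeout_callback(ctrlr, 5 * 1000 * 1000 /* us */,
						   io_timeout_cb, NULL);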
+ */ + break; + } + } +} + +#define MAX_COMPLETIONS_PER_POLL 128 + +int +nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL]; + int i, rc, batch_size; + uint32_t reaped; + struct ibv_cq *cq; + + if (max_completions == 0) { + max_completions = rqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, rqpair->num_entries); + } + + cq = rqpair->cq; + + reaped = 0; + do { + batch_size = spdk_min((max_completions - reaped), + MAX_COMPLETIONS_PER_POLL); + rc = ibv_poll_cq(cq, batch_size, wc); + if (rc < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -1; + } else if (rc == 0) { + /* Ran out of completions */ + break; + } + + for (i = 0; i < rc; i++) { + if (wc[i].status) { + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + qpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + return -1; + } + + switch (wc[i].opcode) { + case IBV_WC_RECV: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n"); + + reaped++; + + if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len); + return -1; + } + + if (nvme_rdma_recv(rqpair, wc[i].wr_id)) { + SPDK_ERRLOG("nvme_rdma_recv processing failure\n"); + return -1; + } + break; + + case IBV_WC_SEND: + break; + + default: + SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", wc[i].opcode); + return -1; + } + } + } while (reaped < max_completions); + + if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + return reaped; +} + +uint32_t +nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* Todo, which should get from the NVMF target */ + return NVME_RDMA_RW_BUFFER_SIZE; +} + +uint16_t +nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return spdk_min(ctrlr->cdata.nvmf_specific.msdbd, NVME_RDMA_MAX_SGL_DESCRIPTORS); +} + +void * +nvme_rdma_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) +{ + return NULL; +} + +int +nvme_rdma_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) +{ + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c new file mode 100644 index 00000000..56052a0f --- /dev/null +++ b/src/spdk/lib/nvme/nvme_transport.c @@ -0,0 +1,219 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe transport abstraction + */ + +#include "nvme_internal.h" + +#ifdef DEBUG +static __attribute__((noreturn)) void +nvme_transport_unknown(enum spdk_nvme_transport_type trtype) +{ + SPDK_ERRLOG("Unknown transport %d\n", (int)trtype); + abort(); +} +#define TRANSPORT_DEFAULT(trtype) default: nvme_transport_unknown(trtype); +#else +#define TRANSPORT_DEFAULT(trtype) +#endif + +#define TRANSPORT_PCIE(func_name, args) case SPDK_NVME_TRANSPORT_PCIE: return nvme_pcie_ ## func_name args; +#ifdef SPDK_CONFIG_RDMA +#define TRANSPORT_FABRICS_RDMA(func_name, args) case SPDK_NVME_TRANSPORT_RDMA: return nvme_rdma_ ## func_name args; +#define TRANSPORT_RDMA_AVAILABLE true +#else +#define TRANSPORT_FABRICS_RDMA(func_name, args) case SPDK_NVME_TRANSPORT_RDMA: SPDK_UNREACHABLE(); +#define TRANSPORT_RDMA_AVAILABLE false +#endif +#define TRANSPORT_FABRICS_FC(func_name, args) case SPDK_NVME_TRANSPORT_FC: SPDK_UNREACHABLE(); +#define NVME_TRANSPORT_CALL(trtype, func_name, args) \ + do { \ + switch (trtype) { \ + TRANSPORT_PCIE(func_name, args) \ + TRANSPORT_FABRICS_RDMA(func_name, args) \ + TRANSPORT_FABRICS_FC(func_name, args) \ + TRANSPORT_DEFAULT(trtype) \ + } \ + SPDK_UNREACHABLE(); \ + } while (0) + +bool +spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype) +{ + switch (trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + return true; + + case SPDK_NVME_TRANSPORT_RDMA: + return TRANSPORT_RDMA_AVAILABLE; + + case SPDK_NVME_TRANSPORT_FC: + return false; + } + + return false; +} + +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + NVME_TRANSPORT_CALL(trid->trtype, ctrlr_construct, (trid, opts, devhandle)); +} + +int +nvme_transport_ctrlr_scan(const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_remove_cb remove_cb, + bool direct_connect) +{ + NVME_TRANSPORT_CALL(trid->trtype, ctrlr_scan, (trid, cb_ctx, probe_cb, remove_cb, direct_connect)); +} + +int +nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_destruct, (ctrlr)); +} + +int +nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_enable, (ctrlr)); +} + +int +nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_set_reg_4, (ctrlr, offset, value)); +} + +int +nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_set_reg_8, (ctrlr, offset, value)); +} + +int 
+nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_reg_4, (ctrlr, offset, value)); +} + +int +nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_reg_8, (ctrlr, offset, value)); +} + +uint32_t +nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_max_xfer_size, (ctrlr)); +} + +uint16_t +nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_max_sges, (ctrlr)); +} + +void * +nvme_transport_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_alloc_cmb_io_buffer, (ctrlr, size)); +} + +int +nvme_transport_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_free_cmb_io_buffer, (ctrlr, buf, size)); +} + +struct spdk_nvme_qpair * +nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_create_io_qpair, (ctrlr, qid, opts)); +} + +int +nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_delete_io_qpair, (ctrlr, qpair)); +} + +int +nvme_transport_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_reinit_io_qpair, (ctrlr, qpair)); +} + +int +nvme_transport_qpair_enable(struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_enable, (qpair)); +} + +int +nvme_transport_qpair_disable(struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_disable, (qpair)); +} + +int +nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_reset, (qpair)); +} + +int +nvme_transport_qpair_fail(struct spdk_nvme_qpair *qpair) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_fail, (qpair)); +} + +int +nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_submit_request, (qpair, req)); +} + +int32_t +nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + NVME_TRANSPORT_CALL(qpair->trtype, qpair_process_completions, (qpair, max_completions)); +} diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c new file mode 100644 index 00000000..724cbc5c --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.c @@ -0,0 +1,214 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
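Each nvme_transport_* wrapper above is a single NVME_TRANSPORT_CALL invocation, which the preprocessor expands into a switch on the transport type. For instance, nvme_transport_qpair_submit_request() becomes roughly the following when RDMA support is compiled in (a sketch of the expansion, not literal preprocessor output):

	switch (qpair->trtype) {
	case SPDK_NVME_TRANSPORT_PCIE:
		return nvme_pcie_qpair_submit_request(qpair, req);
	case SPDK_NVME_TRANSPORT_RDMA:
		return nvme_rdma_qpair_submit_request(qpair, req);
	case SPDK_NVME_TRANSPORT_FC:
		SPDK_UNREACHABLE();
	default:
		nvme_transport_unknown(qpair->trtype);	/* DEBUG builds only */
	}
	SPDK_UNREACHABLE();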
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/string.h" + +#include "spdk/log.h" +#include "spdk/event.h" + +#include "nvme_uevent.h" + +#ifdef __linux__ + +#include + +#define SPDK_UEVENT_MSG_LEN 4096 + +int +spdk_uevent_connect(void) +{ + struct sockaddr_nl addr; + int netlink_fd; + int size = 64 * 1024; + int flag; + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = getpid(); + addr.nl_groups = 0xffffffff; + + netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT); + if (netlink_fd < 0) { + return -1; + } + + setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size)); + + flag = fcntl(netlink_fd, F_GETFL); + if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd, + spdk_strerror(errno)); + close(netlink_fd); + return -1; + } + + if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { + close(netlink_fd); + return -1; + } + return netlink_fd; +} + +/* Note: We only parse the event from uio subsystem and will ignore + * all the event from other subsystem. 
the event from uio subsystem + * as below: + * action: "add" or "remove" + * subsystem: "uio" + * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0" + */ +static int +parse_event(const char *buf, struct spdk_uevent *event) +{ + char action[SPDK_UEVENT_MSG_LEN]; + char subsystem[SPDK_UEVENT_MSG_LEN]; + char dev_path[SPDK_UEVENT_MSG_LEN]; + char driver[SPDK_UEVENT_MSG_LEN]; + char vfio_pci_addr[SPDK_UEVENT_MSG_LEN]; + + memset(action, 0, SPDK_UEVENT_MSG_LEN); + memset(subsystem, 0, SPDK_UEVENT_MSG_LEN); + memset(dev_path, 0, SPDK_UEVENT_MSG_LEN); + memset(driver, 0, SPDK_UEVENT_MSG_LEN); + memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN); + + while (*buf) { + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + snprintf(action, sizeof(action), "%s", buf); + } else if (!strncmp(buf, "DEVPATH=", 8)) { + buf += 8; + snprintf(dev_path, sizeof(dev_path), "%s", buf); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + snprintf(subsystem, sizeof(subsystem), "%s", buf); + } else if (!strncmp(buf, "DRIVER=", 7)) { + buf += 7; + snprintf(driver, sizeof(driver), "%s", buf); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf); + } + while (*buf++) + ; + } + + if (!strncmp(subsystem, "uio", 3)) { + char *pci_address, *tmp; + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO; + if (!strncmp(action, "add", 3)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + tmp = strstr(dev_path, "/uio/"); + + memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path)); + + pci_address = strrchr(dev_path, '/'); + pci_address++; + if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address); + return -1; + } + spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + } + if (!strncmp(driver, "vfio-pci", 8)) { + struct spdk_pci_addr pci_addr; + + event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO; + if (!strncmp(action, "add", 3)) { + event->action = SPDK_NVME_UEVENT_ADD; + } + if (!strncmp(action, "remove", 6)) { + event->action = SPDK_NVME_UEVENT_REMOVE; + } + if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) { + SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr); + return -1; + } + spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + + } + return -1; +} + +int +spdk_get_uevent(int fd, struct spdk_uevent *uevent) +{ + int ret; + char buf[SPDK_UEVENT_MSG_LEN]; + + memset(uevent, 0, sizeof(struct spdk_uevent)); + memset(buf, 0, SPDK_UEVENT_MSG_LEN); + + ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT); + if (ret > 0) { + return parse_event(buf, uevent); + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + /* connection closed */ + if (ret == 0) { + return -1; + } + return 0; +} + +#else /* Not Linux */ + +int +spdk_uevent_connect(void) +{ + return -1; +} + +int +spdk_get_uevent(int fd, struct spdk_uevent *uevent) +{ + return -1; +} +#endif diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h new file mode 100644 index 00000000..7fe0ab7a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * SPDK uevent + */ + +#include "spdk/env.h" +#include "spdk/nvmf_spec.h" + +#ifndef SPDK_UEVENT_H_ +#define SPDK_UEVENT_H_ + +#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1 +#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2 + +enum spdk_nvme_uevent_action { + SPDK_NVME_UEVENT_ADD = 0, + SPDK_NVME_UEVENT_REMOVE = 1, +}; + +struct spdk_uevent { + enum spdk_nvme_uevent_action action; + int subsystem; + char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1]; +}; + +int spdk_uevent_connect(void); +int spdk_get_uevent(int fd, struct spdk_uevent *uevent); + +#endif /* SPDK_UEVENT_H_ */ diff --git a/src/spdk/lib/nvmf/Makefile b/src/spdk/lib/nvmf/Makefile new file mode 100644 index 00000000..8f299a90 --- /dev/null +++ b/src/spdk/lib/nvmf/Makefile @@ -0,0 +1,63 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
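A consumer of this header is expected to open the netlink socket once and then poll it without blocking; a minimal hotplug loop built on the two declared functions might look like this (handler bodies are placeholders):

	/* Sketch of a hotplug poller using nvme_uevent.h. */
	int fd = spdk_uevent_connect();
	struct spdk_uevent ev;

	while (fd >= 0 && spdk_get_uevent(fd, &ev) > 0) {
		if (ev.action == SPDK_NVME_UEVENT_ADD) {
			/* probe the new controller at ev.traddr */
		} else if (ev.action == SPDK_NVME_UEVENT_REMOVE) {
			/* detach the controller at ev.traddr */
		}
	}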
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \ + subsystem.c nvmf.c \ + request.c transport.c + +C_SRCS-$(CONFIG_RDMA) += rdma.c +LIBNAME = nvmf +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvmf/ctrlr.c b/src/spdk/lib/nvmf/ctrlr.c new file mode 100644 index 00000000..ed5e68f0 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr.c @@ -0,0 +1,1773 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/bit_array.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/nvme_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +#include "spdk_internal/log.h" + +#define MIN_KEEP_ALIVE_TIMEOUT 10000 + +#define MODEL_NUMBER "SPDK bdev Controller" + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static inline void +spdk_nvmf_invalid_connect_response(struct spdk_nvmf_fabric_connect_rsp *rsp, + uint8_t iattr, uint16_t ipo) +{ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + rsp->status_code_specific.invalid.iattr = iattr; + rsp->status_code_specific.invalid.ipo = ipo; +} + +#define SPDK_NVMF_INVALID_CONNECT_CMD(rsp, field) \ + spdk_nvmf_invalid_connect_response(rsp, 0, offsetof(struct spdk_nvmf_fabric_connect_cmd, field)) +#define SPDK_NVMF_INVALID_CONNECT_DATA(rsp, field) \ + spdk_nvmf_invalid_connect_response(rsp, 1, offsetof(struct spdk_nvmf_fabric_connect_data, field)) + +static void +ctrlr_add_qpair_and_update_rsp(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_fabric_connect_rsp *rsp) +{ + assert(ctrlr->admin_qpair->group->thread == spdk_get_thread()); + + /* check if we would exceed ctrlr connection limit */ + if (qpair->qid >= spdk_bit_array_capacity(ctrlr->qpair_mask)) { + SPDK_ERRLOG("Requested QID %u but Max QID is %u\n", + qpair->qid, spdk_bit_array_capacity(ctrlr->qpair_mask) - 1); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) { + SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + qpair->ctrlr = ctrlr; + spdk_bit_array_set(ctrlr->qpair_mask, qpair->qid); + + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + rsp->status_code_specific.success.cntlid = ctrlr->cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "connect capsule response: cntlid = 0x%04x\n", + rsp->status_code_specific.success.cntlid); +} + +static void +_spdk_nvmf_request_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + + spdk_nvmf_request_complete(req); +} + +static void +_spdk_nvmf_ctrlr_add_admin_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + ctrlr->admin_qpair = qpair; + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + spdk_nvmf_request_complete(req); +} + +static void +_spdk_nvmf_subsystem_add_ctrlr(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (spdk_nvmf_subsystem_add_ctrlr(ctrlr->subsys, ctrlr)) { + SPDK_ERRLOG("Unable to add controller to subsystem\n"); + free(ctrlr); + qpair->ctrlr = NULL; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_thread_send_msg(qpair->group->thread, 
_spdk_nvmf_request_complete, req); + return; + } + + spdk_thread_send_msg(ctrlr->thread, _spdk_nvmf_ctrlr_add_admin_qpair, req); +} + +static struct spdk_nvmf_ctrlr * +spdk_nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_request *req, + struct spdk_nvmf_fabric_connect_cmd *connect_cmd, + struct spdk_nvmf_fabric_connect_data *connect_data) +{ + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_transport *transport; + + ctrlr = calloc(1, sizeof(*ctrlr)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + req->qpair->ctrlr = ctrlr; + ctrlr->subsys = subsystem; + ctrlr->thread = req->qpair->group->thread; + + transport = req->qpair->transport; + ctrlr->qpair_mask = spdk_bit_array_create(transport->opts.max_qpairs_per_ctrlr); + if (!ctrlr->qpair_mask) { + SPDK_ERRLOG("Failed to allocate controller qpair mask\n"); + free(ctrlr); + return NULL; + } + + ctrlr->feat.keep_alive_timer.bits.kato = connect_cmd->kato; + ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1; + ctrlr->feat.volatile_write_cache.bits.wce = 1; + + /* Subtract 1 for admin queue, 1 for 0's based */ + ctrlr->feat.number_of_queues.bits.ncqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + ctrlr->feat.number_of_queues.bits.nsqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + + memcpy(ctrlr->hostid, connect_data->hostid, sizeof(ctrlr->hostid)); + + ctrlr->vcprop.cap.raw = 0; + ctrlr->vcprop.cap.bits.cqr = 1; /* NVMe-oF specification required */ + ctrlr->vcprop.cap.bits.mqes = transport->opts.max_queue_depth - + 1; /* max queue depth */ + ctrlr->vcprop.cap.bits.ams = 0; /* optional arb mechanisms */ + ctrlr->vcprop.cap.bits.to = 1; /* ready timeout - 500 msec units */ + ctrlr->vcprop.cap.bits.dstrd = 0; /* fixed to 0 for NVMe-oF */ + ctrlr->vcprop.cap.bits.css = SPDK_NVME_CAP_CSS_NVM; /* NVM command set */ + ctrlr->vcprop.cap.bits.mpsmin = 0; /* 2 ^ (12 + mpsmin) == 4k */ + ctrlr->vcprop.cap.bits.mpsmax = 0; /* 2 ^ (12 + mpsmax) == 4k */ + + /* Version Supported: 1.3 */ + ctrlr->vcprop.vs.bits.mjr = 1; + ctrlr->vcprop.vs.bits.mnr = 3; + ctrlr->vcprop.vs.bits.ter = 0; + + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.cc.bits.en = 0; /* Init controller disabled */ + + ctrlr->vcprop.csts.raw = 0; + ctrlr->vcprop.csts.bits.rdy = 0; /* Init controller as not ready */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cap 0x%" PRIx64 "\n", ctrlr->vcprop.cap.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "vs 0x%x\n", ctrlr->vcprop.vs.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cc 0x%x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "csts 0x%x\n", ctrlr->vcprop.csts.raw); + + spdk_thread_send_msg(subsystem->thread, _spdk_nvmf_subsystem_add_ctrlr, req); + + return ctrlr; +} + +void +spdk_nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr) +{ + spdk_nvmf_subsystem_remove_ctrlr(ctrlr->subsys, ctrlr); + + free(ctrlr); +} + +static void +spdk_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + /* Unit test will check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect. + * For error case, the value should be NULL. So set it to NULL at first. 
+ */ + qpair->ctrlr = NULL; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_ERRLOG("I/O connect not allowed on discovery controller\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (!ctrlr->vcprop.cc.bits.en) { + SPDK_ERRLOG("Got I/O connect before ctrlr was enabled\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iosqes != sizeof(struct spdk_nvme_cmd)) { + SPDK_ERRLOG("Got I/O connect with invalid IOSQES %u\n", + ctrlr->vcprop.cc.bits.iosqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iocqes != sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("Got I/O connect with invalid IOCQES %u\n", + ctrlr->vcprop.cc.bits.iocqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + +end: + spdk_thread_send_msg(qpair->group->thread, _spdk_nvmf_request_complete, req); +} + +static void +_spdk_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + /* We already checked this in spdk_nvmf_ctrlr_connect */ + assert(subsystem != NULL); + + ctrlr = spdk_nvmf_subsystem_get_ctrlr(subsystem, data->cntlid); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unknown controller ID 0x%x\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + spdk_thread_send_msg(qpair->group->thread, _spdk_nvmf_request_complete, req); + return; + } + + admin_qpair = ctrlr->admin_qpair; + qpair->ctrlr = ctrlr; + spdk_thread_send_msg(admin_qpair->group->thread, spdk_nvmf_ctrlr_add_io_qpair, req); +} + +static int +spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_cmd *cmd = &req->cmd->connect_cmd; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_subsystem *subsystem; + const char *subnqn, *hostnqn; + struct spdk_nvme_transport_id listen_trid = {}; + void *end; + + if (req->length < sizeof(struct spdk_nvmf_fabric_connect_data)) { + SPDK_ERRLOG("Connect command data length 0x%x too small\n", req->length); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "recfmt 0x%x qid %u sqsize %u\n", + cmd->recfmt, cmd->qid, cmd->sqsize); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect data:\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " cntlid: 0x%04x\n", data->cntlid); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostid: %08x-%04x-%04x-%02x%02x-%04x%08x ***\n", + ntohl(*(uint32_t *)&data->hostid[0]), + ntohs(*(uint16_t *)&data->hostid[4]), + ntohs(*(uint16_t *)&data->hostid[6]), + data->hostid[8], + data->hostid[9], + ntohs(*(uint16_t *)&data->hostid[10]), + ntohl(*(uint32_t *)&data->hostid[12])); + + if (cmd->recfmt != 0) { + SPDK_ERRLOG("Connect command unsupported RECFMT %u\n", cmd->recfmt); + 
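The I/O connect checks above tie the CC queue entry sizes to the fixed fabrics capsule layout: the target only accepts 64-byte submission entries and 16-byte completion entries, so a conforming host must have programmed CC.IOSQES = 6 and CC.IOCQES = 4 before issuing an I/O connect (1u << 6 == 64, 1u << 4 == 16). A compile-time restatement of that invariant, assuming SPDK_STATIC_ASSERT from spdk/assert.h and the public spec structures (illustrative, not part of the patch):

#include "spdk/assert.h"
#include "spdk/nvme_spec.h"

/* The only queue entry sizes spdk_nvmf_ctrlr_add_io_qpair() will accept. */
SPDK_STATIC_ASSERT((1u << 6) == sizeof(struct spdk_nvme_cmd), "SQE must be 64 bytes");
SPDK_STATIC_ASSERT((1u << 4) == sizeof(struct spdk_nvme_cpl), "CQE must be 16 bytes");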
rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that subnqn is null terminated */ + end = memchr(data->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Connect SUBNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + subnqn = data->subnqn; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " subnqn: \"%s\"\n", subnqn); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, subnqn); + if (subsystem == NULL) { + SPDK_ERRLOG("Could not find subsystem '%s'\n", subnqn); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that hostnqn is null terminated */ + end = memchr(data->hostnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Connect HOSTNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, hostnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + hostnqn = data->hostnqn; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostnqn: \"%s\"\n", hostnqn); + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s'\n", subnqn, hostnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_nvmf_qpair_get_listen_trid(qpair, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' is unable to enforce access control due to an internal error.\n", + subnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (!spdk_nvmf_subsystem_listener_allowed(subsystem, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s' to connect at this address.\n", subnqn, + hostnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* + * SQSIZE is a 0-based value, so it must be at least 1 (minimum queue depth is 2) and + * strictly less than max_queue_depth. + */ + if (cmd->sqsize == 0 || cmd->sqsize >= qpair->transport->opts.max_queue_depth) { + SPDK_ERRLOG("Invalid SQSIZE %u (min 1, max %u)\n", + cmd->sqsize, qpair->transport->opts.max_queue_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + qpair->sq_head_max = cmd->sqsize; + qpair->qid = cmd->qid; + + if (cmd->qid == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect Admin Queue for controller ID 0x%x\n", data->cntlid); + + if (data->cntlid != 0xFFFF) { + /* This NVMf target only supports dynamic mode. 
*/ + SPDK_ERRLOG("The NVMf target only supports dynamic mode (CNTLID = 0x%x).\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Establish a new ctrlr */ + ctrlr = spdk_nvmf_ctrlr_create(subsystem, req, cmd, data); + if (!ctrlr) { + SPDK_ERRLOG("spdk_nvmf_ctrlr_create() failed\n"); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + } else { + spdk_thread_send_msg(subsystem->thread, _spdk_nvmf_ctrlr_add_io_qpair, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } +} + +static uint64_t +nvmf_prop_get_cap(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cap.raw; +} + +static uint64_t +nvmf_prop_get_vs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.vs.raw; +} + +static uint64_t +nvmf_prop_get_cc(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cc.raw; +} + +static bool +nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint64_t value) +{ + union spdk_nvme_cc_register cc, diff; + + cc.raw = (uint32_t)value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cur CC: 0x%08x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "new CC: 0x%08x\n", cc.raw); + + /* + * Calculate which bits changed between the current and new CC. + * Mark each bit as 0 once it is handled to determine if any unhandled bits were changed. + */ + diff.raw = cc.raw ^ ctrlr->vcprop.cc.raw; + + if (diff.bits.en) { + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n"); + ctrlr->vcprop.cc.bits.en = 1; + ctrlr->vcprop.csts.bits.rdy = 1; + } else { + SPDK_ERRLOG("CC.EN transition from 1 to 0 (reset) not implemented!\n"); + + } + diff.bits.en = 0; + } + + if (diff.bits.shn) { + if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || + cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + ctrlr->vcprop.cc.bits.shn = cc.bits.shn; + ctrlr->vcprop.cc.bits.en = 0; + ctrlr->vcprop.csts.bits.rdy = 0; + ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; + } else if (cc.bits.shn == 0) { + ctrlr->vcprop.cc.bits.shn = 0; + } else { + SPDK_ERRLOG("Prop Set CC: Invalid SHN value %u%ub\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + return false; + } + diff.bits.shn = 0; + } + + if (diff.bits.iosqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOSQES = %u (%u bytes)\n", + cc.bits.iosqes, 1u << cc.bits.iosqes); + ctrlr->vcprop.cc.bits.iosqes = cc.bits.iosqes; + diff.bits.iosqes = 0; + } + + if (diff.bits.iocqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOCQES = %u (%u bytes)\n", + cc.bits.iocqes, 1u << cc.bits.iocqes); + ctrlr->vcprop.cc.bits.iocqes = cc.bits.iocqes; + diff.bits.iocqes = 0; + } + + if (diff.raw != 0) { + SPDK_ERRLOG("Prop Set CC toggled reserved bits 0x%x!\n", diff.raw); + return false; + } + + return true; +} + +static uint64_t +nvmf_prop_get_csts(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.csts.raw; +} + +struct nvmf_prop { + uint32_t ofst; + uint8_t size; + char name[11]; + uint64_t (*get_cb)(struct spdk_nvmf_ctrlr *ctrlr); + bool (*set_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint64_t value); +}; + +#define PROP(field, size, get_cb, set_cb) \ + { \ + offsetof(struct spdk_nvme_registers, field), \ + SPDK_NVMF_PROP_SIZE_##size, \ + #field, \ + get_cb, set_cb \ + } + +static const struct nvmf_prop nvmf_props[] = { + PROP(cap, 8, nvmf_prop_get_cap, NULL), + PROP(vs, 4, nvmf_prop_get_vs, NULL), + 
PROP(cc, 4, nvmf_prop_get_cc, nvmf_prop_set_cc), + PROP(csts, 4, nvmf_prop_get_csts, NULL), +}; + +static const struct nvmf_prop * +find_prop(uint32_t ofst) +{ + size_t i; + + for (i = 0; i < SPDK_COUNTOF(nvmf_props); i++) { + const struct nvmf_prop *prop = &nvmf_props[i]; + + if (prop->ofst == ofst) { + return prop; + } + } + + return NULL; +} + +static int +spdk_nvmf_property_get(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_get_cmd *cmd = &req->cmd->prop_get_cmd; + struct spdk_nvmf_fabric_prop_get_rsp *response = &req->rsp->prop_get_rsp; + const struct nvmf_prop *prop; + + response->status.sc = 0; + response->value.u64 = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x\n", + cmd->attrib.size, cmd->ofst); + + if (cmd->attrib.size != SPDK_NVMF_PROP_SIZE_4 && + cmd->attrib.size != SPDK_NVMF_PROP_SIZE_8) { + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst); + if (prop == NULL || prop->get_cb == NULL) { + /* Reserved properties return 0 when read */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + if (cmd->attrib.size != prop->size) { + SPDK_ERRLOG("offset 0x%x size mismatch: cmd %u, prop %u\n", + cmd->ofst, cmd->attrib.size, prop->size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + response->value.u64 = prop->get_cb(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "response value: 0x%" PRIx64 "\n", response->value.u64); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_property_set(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_set_cmd *cmd = &req->cmd->prop_set_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + const struct nvmf_prop *prop; + uint64_t value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x, value 0x%" PRIx64 "\n", + cmd->attrib.size, cmd->ofst, cmd->value.u64); + + prop = find_prop(cmd->ofst); + if (prop == NULL || prop->set_cb == NULL) { + SPDK_ERRLOG("Invalid offset 0x%x\n", cmd->ofst); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + if (cmd->attrib.size != prop->size) { + SPDK_ERRLOG("offset 0x%x size mismatch: cmd %u, prop %u\n", + cmd->ofst, cmd->attrib.size, prop->size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + value = cmd->value.u64; + if (prop->size == SPDK_NVMF_PROP_SIZE_4) { + value = (uint32_t)value; + } + + if (!prop->set_cb(ctrlr, value)) { + SPDK_ERRLOG("prop set_cb failed\n"); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_arbitration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = 
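find_prop() keys the table above on byte offsets into struct spdk_nvme_registers, so a fabrics Property Get or Set names a controller register exactly as a PCIe host would, by its offset in the standard register map. Only CAP, VS, CC and CSTS are served; anything else reads back as zero on get and is rejected on set. A small standalone check of the offsets the PROP() entries resolve to (illustrative; the expected output follows the NVMe register layout):

#include <stdio.h>
#include <stddef.h>

#include "spdk/nvme_spec.h"

int
main(void)
{
	/* Expected: cap=0x0 vs=0x8 cc=0x14 csts=0x1c */
	printf("cap=0x%zx vs=0x%zx cc=0x%zx csts=0x%zx\n",
	       offsetof(struct spdk_nvme_registers, cap),
	       offsetof(struct spdk_nvme_registers, vs),
	       offsetof(struct spdk_nvme_registers, cc),
	       offsetof(struct spdk_nvme_registers, csts));
	return 0;
}

A Property Get of CSTS therefore carries OFST = 0x1C with ATTRIB.SIZE = SPDK_NVMF_PROP_SIZE_4, and the value returned is ctrlr->vcprop.csts.raw via nvmf_prop_get_csts().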
&req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Arbitration (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.arbitration.raw = cmd->cdw11; + ctrlr->feat.arbitration.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_power_management(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_power_management opts; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Power Management (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + /* Only PS = 0 is allowed, since we report NPSS = 0 */ + if (opts.bits.ps != 0) { + SPDK_ERRLOG("Invalid power state %u\n", opts.bits.ps); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.power_management.raw = cmd->cdw11; + ctrlr->feat.power_management.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +temp_threshold_opts_valid(const union spdk_nvme_feat_temperature_threshold *opts) +{ + /* + * Valid TMPSEL values: + * 0000b - 1000b: temperature sensors + * 1111b: set all implemented temperature sensors + */ + if (opts->bits.tmpsel >= 9 && opts->bits.tmpsel != 15) { + /* 1001b - 1110b: reserved */ + SPDK_ERRLOG("Invalid TMPSEL %u\n", opts->bits.tmpsel); + return false; + } + + /* + * Valid THSEL values: + * 00b: over temperature threshold + * 01b: under temperature threshold + */ + if (opts->bits.thsel > 1) { + /* 10b - 11b: reserved */ + SPDK_ERRLOG("Invalid THSEL %u\n", opts->bits.thsel); + return false; + } + + return true; +} + +static int +spdk_nvmf_ctrlr_set_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_temperature_threshold opts; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + if (!temp_threshold_opts_valid(&opts)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - ignore new values */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_temperature_threshold opts; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + if (!temp_threshold_opts_valid(&opts)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - return 0 for all thresholds */ + rsp->cdw0 = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_error_recovery(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_error_recovery opts; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Error Recovery (cdw11 = 0x%0x)\n", 
cmd->cdw11); + opts.raw = cmd->cdw11; + + if (opts.bits.dulbe) { + /* + * Host is not allowed to set this bit, since we don't advertise it in + * Identify Namespace. + */ + SPDK_ERRLOG("Host set unsupported DULBE bit\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.error_recovery.raw = cmd->cdw11; + ctrlr->feat.error_recovery.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_volatile_write_cache(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.volatile_write_cache.raw = cmd->cdw11; + ctrlr->feat.volatile_write_cache.bits.reserved = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache %s\n", + ctrlr->feat.volatile_write_cache.bits.wce ? "Enabled" : "Disabled"); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_write_atomicity(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Write Atomicity (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.write_atomicity.raw = cmd->cdw11; + ctrlr->feat.write_atomicity.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_ERRLOG("Set Features - Host Identifier not allowed\n"); + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + union spdk_nvme_feat_host_identifier opts; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Host Identifier\n"); + + opts.raw = cmd->cdw11; + if (!opts.bits.exhid) { + /* NVMe over Fabrics requires EXHID=1 (128-bit/16-byte host ID) */ + SPDK_ERRLOG("Get Features - Host Identifier with EXHID=0 not allowed\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data == NULL || req->length < sizeof(ctrlr->hostid)) { + SPDK_ERRLOG("Invalid data buffer for Get Features - Host Identifier\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + memcpy(req->data, ctrlr->hostid, sizeof(ctrlr->hostid)); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_keep_alive_timer(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer (%u ms)\n", cmd->cdw11); + + if (cmd->cdw11 == 0) { + rsp->status.sc = SPDK_NVME_SC_KEEP_ALIVE_INVALID; + } else if (cmd->cdw11 < MIN_KEEP_ALIVE_TIMEOUT) { + ctrlr->feat.keep_alive_timer.bits.kato = MIN_KEEP_ALIVE_TIMEOUT; + } else { + ctrlr->feat.keep_alive_timer.bits.kato = cmd->cdw11; + } + + 
SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer set to %u ms\n", + ctrlr->feat.keep_alive_timer.bits.kato); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_number_of_queues(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint32_t count; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Number of Queues, cdw11 0x%x\n", + req->cmd->nvme_cmd.cdw11); + + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + /* verify that the controller is ready to process commands */ + if (count > 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Queue pairs already active!\n"); + rsp->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + } else { + /* + * Ignore the value requested by the host - + * always return the pre-configured value based on max_qpairs_allowed. + */ + rsp->cdw0 = ctrlr->feat.number_of_queues.raw; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_async_event_configuration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Async Event Configuration, cdw11 0x%08x\n", + cmd->cdw11); + ctrlr->feat.async_event_configuration.raw = cmd->cdw11; + ctrlr->feat.async_event_configuration.bits.reserved = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n"); + + /* Only one asynchronous event is supported for now */ + if (ctrlr->aer_req != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n"); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + rsp->cdw0 = ctrlr->notice_event.raw; + ctrlr->notice_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->aer_req = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +spdk_nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length) +{ + struct spdk_nvme_firmware_page fw_page; + size_t copy_len; + + memset(&fw_page, 0, sizeof(fw_page)); + fw_page.afi.active_slot = 1; + fw_page.afi.next_reset_slot = 0; + spdk_strcpy_pad(fw_page.revision[0], FW_VERSION, sizeof(fw_page.revision[0]), ' '); + + if (offset < sizeof(fw_page)) { + copy_len = spdk_min(sizeof(fw_page) - offset, length); + if (copy_len > 0) { + memcpy(buffer, (const char *)&fw_page + offset, copy_len); + } + } +} + +void +spdk_nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid) +{ + uint16_t max_changes = SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list); + uint16_t i; + bool found = false; + + for (i = 0; i < ctrlr->changed_ns_list_count; i++) { + if (ctrlr->changed_ns_list.ns_list[i] == nsid) { + /* nsid is already in the list */ + found = true; + break; + } + } + + if (!found) { + if (ctrlr->changed_ns_list_count == max_changes) { + /* Out of space - set first entry to FFFFFFFFh and zero-fill the rest. 
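Two of the Set Features handlers above substitute target policy for the host's request. Keep Alive Timer clamps low values: with MIN_KEEP_ALIVE_TIMEOUT = 10000, a requested KATO of 5000 ms is stored as 10000 ms, 120000 ms is stored unchanged, and a KATO of 0 completes with SPDK_NVME_SC_KEEP_ALIVE_INVALID. Number of Queues ignores the host's NCQR/NSQR entirely and returns the pair pre-computed at connect time (failing with a Command Sequence Error if I/O queue pairs are already connected). As a worked example with an assumed transport configuration of max_qpairs_per_ctrlr = 4: ncqr = nsqr = 4 - 1 (admin qpair) - 1 (0's based) = 2, so the completion dword reads back as 0x00020002 (NSQA in the low word, NCQA in the high word) and the host may create up to three I/O submission/completion queue pairs.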
*/ + ctrlr->changed_ns_list.ns_list[0] = 0xFFFFFFFFu; + for (i = 1; i < max_changes; i++) { + ctrlr->changed_ns_list.ns_list[i] = 0; + } + } else { + ctrlr->changed_ns_list.ns_list[ctrlr->changed_ns_list_count++] = nsid; + } + } + + spdk_nvmf_ctrlr_async_event_ns_notice(ctrlr); +} + +static void +spdk_nvmf_get_changed_ns_list_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *buffer, uint64_t offset, uint32_t length) +{ + size_t copy_length; + + if (offset < sizeof(ctrlr->changed_ns_list)) { + copy_length = spdk_min(length, sizeof(ctrlr->changed_ns_list) - offset); + if (copy_length) { + memcpy(buffer, (char *)&ctrlr->changed_ns_list + offset, copy_length); + } + } + + /* Clear log page each time it is read */ + ctrlr->changed_ns_list_count = 0; + memset(&ctrlr->changed_ns_list, 0, sizeof(ctrlr->changed_ns_list)); +} + +/* The structure can be modified if we provide support for other commands in future */ +static const struct spdk_nvme_cmds_and_effect_log_page g_cmds_and_effect_log_page = { + .admin_cmds_supported = { + /* CSUPP, LBCC, NCC, NIC, CCC, CSE */ + /* Get Log Page */ + [SPDK_NVME_OPC_GET_LOG_PAGE] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Identify */ + [SPDK_NVME_OPC_IDENTIFY] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Abort */ + [SPDK_NVME_OPC_ABORT] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Set Features */ + [SPDK_NVME_OPC_SET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Get Features */ + [SPDK_NVME_OPC_GET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Async Event Request */ + [SPDK_NVME_OPC_ASYNC_EVENT_REQUEST] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Keep Alive */ + [SPDK_NVME_OPC_KEEP_ALIVE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, + .io_cmds_supported = { + /* FLUSH */ + [SPDK_NVME_OPC_FLUSH] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* WRITE */ + [SPDK_NVME_OPC_WRITE] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* READ */ + [SPDK_NVME_OPC_READ] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* WRITE ZEROES */ + [SPDK_NVME_OPC_WRITE_ZEROES] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* DATASET MANAGEMENT */ + [SPDK_NVME_OPC_DATASET_MANAGEMENT] = {1, 1, 0, 0, 0, 0, 0, 0}, + }, +}; + +static void +spdk_nvmf_get_cmds_and_effects_log_page(void *buffer, + uint64_t offset, uint32_t length) +{ + uint32_t page_size = sizeof(struct spdk_nvme_cmds_and_effect_log_page); + size_t copy_len = 0; + size_t zero_len = length; + + if (offset < page_size) { + copy_len = spdk_min(page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)(&g_cmds_and_effect_log_page) + offset, copy_len); + } + + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } +} + +static int +spdk_nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + uint64_t offset, len; + uint32_t numdl, numdu; + uint8_t lid; + + if (req->data == NULL) { + SPDK_ERRLOG("get log command with no buffer\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + offset = (uint64_t)cmd->cdw12 | ((uint64_t)cmd->cdw13 << 32); + if (offset & 3) { + SPDK_ERRLOG("Invalid log page offset 0x%" PRIx64 "\n", offset); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + numdl = (cmd->cdw10 >> 16) & 0xFFFFu; + numdu = (cmd->cdw11) & 0xFFFFu; + len = ((numdu << 16) + numdl + (uint64_t)1) * 4; + if 
(len > req->length) { + SPDK_ERRLOG("Get log page: len (%" PRIu64 ") > buf size (%u)\n", + len, req->length); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + lid = cmd->cdw10 & 0xFF; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get log page: LID=0x%02X offset=0x%" PRIx64 " len=0x%" PRIx64 "\n", + lid, offset, len); + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + switch (lid) { + case SPDK_NVME_LOG_DISCOVERY: + spdk_nvmf_get_discovery_log_page(subsystem->tgt, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } else { + switch (lid) { + case SPDK_NVME_LOG_ERROR: + case SPDK_NVME_LOG_HEALTH_INFORMATION: + /* TODO: actually fill out log page data */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_FIRMWARE_SLOT: + spdk_nvmf_get_firmware_slot_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG: + spdk_nvmf_get_cmds_and_effects_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_CHANGED_NS_LIST: + spdk_nvmf_get_changed_ns_list_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } + +invalid_log_page: + SPDK_ERRLOG("Unsupported Get Log Page 0x%02X\n", lid); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ns *ns; + uint32_t max_num_blocks; + + if (cmd->nsid == 0 || cmd->nsid > subsystem->max_nsid) { + SPDK_ERRLOG("Identify Namespace for invalid NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _spdk_nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + /* + * Inactive namespaces should return a zero filled data structure. + * The data buffer is already zeroed by spdk_nvmf_ctrlr_process_admin_cmd(), + * so we can just return early here. 
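The NUMD handling above reassembles the 0's-based dword count that the host split across CDW10 (NUMDL in the upper 16 bits) and CDW11 (NUMDU in the lower 16 bits), while the byte offset comes from CDW12/CDW13; both are then checked against the buffer the transport actually delivered. Worked through for a host reading the first 4 KiB of a log page, i.e. NUMDU = 0, NUMDL = 0x3FF, offset = 0:

	len = ((0 << 16) + 0x3FF + 1) * 4 = 1024 dwords = 4096 bytes

which the handler accepts as long as req->length is at least 4096 and the offset is dword-aligned.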
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Identify Namespace for inactive NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + spdk_nvmf_bdev_ctrlr_identify_ns(ns, nsdata); + + /* Due to bug in the Linux kernel NVMe driver we have to set noiob no larger than mdts */ + max_num_blocks = ctrlr->admin_qpair->transport->opts.max_io_size / + (1U << nsdata->lbaf[nsdata->flbas.format].lbads); + if (nsdata->noiob > max_num_blocks) { + nsdata->noiob = max_num_blocks; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_ctrlr_data *cdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_transport *transport = ctrlr->admin_qpair->transport; + + /* + * Common fields for discovery and NVM subsystems + */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + assert((transport->opts.max_io_size % 4096) == 0); + cdata->mdts = spdk_u32log2(transport->opts.max_io_size / 4096); + cdata->cntlid = ctrlr->cntlid; + cdata->ver = ctrlr->vcprop.vs; + cdata->lpa.edlp = 1; + cdata->elpe = 127; + cdata->maxcmd = transport->opts.max_queue_depth; + cdata->sgls.supported = 1; + cdata->sgls.keyed_sgl = 1; + cdata->sgls.sgl_offset = 1; + spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0'); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "sgls data: 0x%x\n", from_le32(&cdata->sgls)); + + /* + * NVM subsystem fields (reserved for discovery subsystems) + */ + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_NVME) { + spdk_strcpy_pad(cdata->mn, MODEL_NUMBER, sizeof(cdata->mn), ' '); + spdk_strcpy_pad(cdata->sn, spdk_nvmf_subsystem_get_sn(subsystem), sizeof(cdata->sn), ' '); + cdata->kas = 10; + + cdata->rab = 6; + cdata->cmic.multi_port = 1; + cdata->cmic.multi_host = 1; + cdata->oaes.ns_attribute_notices = 1; + cdata->ctratt.host_id_exhid_supported = 1; + cdata->aerl = 0; + cdata->frmw.slot1_ro = 1; + cdata->frmw.num_slots = 1; + + cdata->lpa.celp = 1; /* Command Effects log page supported */ + + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->nn = subsystem->max_nsid; + cdata->vwc.present = 1; + cdata->vwc.flush_broadcast = SPDK_NVME_FLUSH_BROADCAST_NOT_SUPPORTED; + + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + cdata->nvmf_specific.iorcsz = sizeof(struct spdk_nvme_cpl) / 16; + cdata->nvmf_specific.icdoff = 0; /* offset starts directly after SQE */ + cdata->nvmf_specific.ctrattr.ctrlr_model = SPDK_NVMF_CTRLR_MODEL_DYNAMIC; + cdata->nvmf_specific.msdbd = 1; /* target supports single SGL in capsule */ + + /* TODO: this should be set by the transport */ + cdata->nvmf_specific.ioccsz += transport->opts.in_capsule_data_size / 16; + + cdata->oncs.dsm = spdk_nvmf_ctrlr_dsm_supported(ctrlr); + cdata->oncs.write_zeroes = spdk_nvmf_ctrlr_write_zeroes_supported(ctrlr); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ioccsz 0x%x\n", + cdata->nvmf_specific.ioccsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: iorcsz 0x%x\n", + cdata->nvmf_specific.iorcsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: icdoff 0x%x\n", + cdata->nvmf_specific.icdoff); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ctrattr 0x%x\n", + *(uint8_t *)&cdata->nvmf_specific.ctrattr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: msdbd 0x%x\n", + 
cdata->nvmf_specific.msdbd); + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_active_ns_list(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_list *ns_list) +{ + struct spdk_nvmf_ns *ns; + uint32_t count = 0; + + if (cmd->nsid >= 0xfffffffeUL) { + SPDK_ERRLOG("Identify Active Namespace List with invalid NSID %u\n", cmd->nsid); + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->opts.nsid <= cmd->nsid) { + continue; + } + + ns_list->ns_list[count++] = ns->opts.nsid; + if (count == SPDK_COUNTOF(ns_list->ns_list)) { + break; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +_add_ns_id_desc(void **buf_ptr, size_t *buf_remain, + enum spdk_nvme_nidt type, + const void *data, size_t data_size) +{ + struct spdk_nvme_ns_id_desc *desc; + size_t desc_size = sizeof(*desc) + data_size; + + /* + * These should never fail in practice, since all valid NS ID descriptors + * should be defined so that they fit in the available 4096-byte buffer. + */ + assert(data_size > 0); + assert(data_size <= UINT8_MAX); + assert(desc_size < *buf_remain); + if (data_size == 0 || data_size > UINT8_MAX || desc_size > *buf_remain) { + return; + } + + desc = *buf_ptr; + desc->nidt = type; + desc->nidl = data_size; + memcpy(desc->nid, data, data_size); + + *buf_ptr += desc_size; + *buf_remain -= desc_size; +} + +static int +spdk_nvmf_ctrlr_identify_ns_id_descriptor_list( + struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + void *id_desc_list, size_t id_desc_list_size) +{ + struct spdk_nvmf_ns *ns; + size_t buf_remain = id_desc_list_size; + void *buf_ptr = id_desc_list; + + ns = _spdk_nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + +#define ADD_ID_DESC(type, data, size) \ + do { \ + if (!spdk_mem_all_zero(data, size)) { \ + _add_ns_id_desc(&buf_ptr, &buf_remain, type, data, size); \ + } \ + } while (0) + + ADD_ID_DESC(SPDK_NVME_NIDT_EUI64, ns->opts.eui64, sizeof(ns->opts.eui64)); + ADD_ID_DESC(SPDK_NVME_NIDT_NGUID, ns->opts.nguid, sizeof(ns->opts.nguid)); + ADD_ID_DESC(SPDK_NVME_NIDT_UUID, &ns->opts.uuid, sizeof(ns->opts.uuid)); + + /* + * The list is automatically 0-terminated because controller to host buffers in + * admin commands always get zeroed in spdk_nvmf_ctrlr_process_admin_cmd(). 
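Several of the Identify Controller fields filled in above are derived from the transport options rather than being constants, so the advertised limits track the configuration. As a worked example, assuming transport options max_io_size = 131072 and in_capsule_data_size = 4096:

	mdts   = spdk_u32log2(131072 / 4096) = 5, i.e. up to 2^5 * 4 KiB = 128 KiB per command
	         (the unit is the CAP.MPSMIN page size, 4 KiB here);
	ioccsz = sizeof(struct spdk_nvme_cmd) / 16 + 4096 / 16 = 4 + 256 = 260 sixteen-byte units;
	iorcsz = sizeof(struct spdk_nvme_cpl) / 16 = 1;
	maxcmd = max_queue_depth as configured.

The assert that max_io_size is a multiple of 4096 is what keeps the mdts computation exact.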
+ */ + +#undef ADD_ID_DESC + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify(struct spdk_nvmf_request *req) +{ + uint8_t cns; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + + if (req->data == NULL || req->length < 4096) { + SPDK_ERRLOG("identify command with invalid buffer\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + cns = cmd->cdw10 & 0xFF; + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY && + cns != SPDK_NVME_IDENTIFY_CTRLR) { + /* Discovery controllers only support Identify Controller */ + goto invalid_cns; + } + + switch (cns) { + case SPDK_NVME_IDENTIFY_NS: + return spdk_nvmf_ctrlr_identify_ns(ctrlr, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_CTRLR: + return spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, req->data); + case SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST: + return spdk_nvmf_ctrlr_identify_active_ns_list(subsystem, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST: + return spdk_nvmf_ctrlr_identify_ns_id_descriptor_list(subsystem, cmd, rsp, req->data, req->length); + default: + goto invalid_cns; + } + +invalid_cns: + SPDK_ERRLOG("Identify command with unsupported CNS 0x%02x\n", cns); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + + +static struct spdk_nvmf_request * +spdk_nvmf_qpair_abort(struct spdk_nvmf_qpair *qpair, uint16_t cid) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_request *req; + + if (spdk_nvmf_qpair_is_admin_queue(qpair)) { + if (ctrlr->aer_req && ctrlr->aer_req->cmd->nvme_cmd.cid == cid) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Aborting AER request\n"); + req = ctrlr->aer_req; + ctrlr->aer_req = NULL; + return req; + } + } + + /* TODO: track list of outstanding requests in qpair? 
*/ + return NULL; +} + +static void +spdk_nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + + spdk_nvmf_request_complete(req); +} + +static void +spdk_nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint16_t sqid = cmd->cdw10 & 0xFFFFu; + struct spdk_nvmf_qpair *qpair; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) { + struct spdk_nvmf_request *req_to_abort; + uint16_t cid = cmd->cdw10 >> 16; + + /* Found the qpair */ + + req_to_abort = spdk_nvmf_qpair_abort(qpair, cid); + if (req_to_abort == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cid %u not found\n", cid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_for_each_channel_continue(i, -EINVAL); + return; + } + + /* Complete the request with aborted status */ + req_to_abort->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req_to_abort->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + spdk_nvmf_request_complete(req_to_abort); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p req=%p sqid=%u cid=%u successful\n", + qpair->ctrlr, req_to_abort, sqid, cid); + rsp->cdw0 = 0; /* Command successfully aborted */ + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + /* Return -1 for the status so the iteration across threads stops. */ + spdk_for_each_channel_continue(i, -1); + + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +spdk_nvmf_ctrlr_abort(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = 1; /* Command not aborted */ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + + /* Send a message to each poll group, searching for this ctrlr, sqid, and command. 
*/ + spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt, + spdk_nvmf_ctrlr_abort_on_pg, + req, + spdk_nvmf_ctrlr_abort_done + ); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = cdw0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10 & 0xff; /* mask out the FID value */ + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return get_features_generic(req, ctrlr->feat.arbitration.raw); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return get_features_generic(req, ctrlr->feat.power_management.raw); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return spdk_nvmf_ctrlr_get_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return get_features_generic(req, ctrlr->feat.error_recovery.raw); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return get_features_generic(req, ctrlr->feat.volatile_write_cache.raw); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return get_features_generic(req, ctrlr->feat.number_of_queues.raw); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return get_features_generic(req, ctrlr->feat.write_atomicity.raw); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return get_features_generic(req, ctrlr->feat.async_event_configuration.raw); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return get_features_generic(req, ctrlr->feat.keep_alive_timer.raw); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return spdk_nvmf_ctrlr_get_features_host_identifier(req); + default: + SPDK_ERRLOG("Get Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +spdk_nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10 & 0xff; /* mask out the FID value */ + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return spdk_nvmf_ctrlr_set_features_arbitration(req); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return spdk_nvmf_ctrlr_set_features_power_management(req); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return spdk_nvmf_ctrlr_set_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return spdk_nvmf_ctrlr_set_features_error_recovery(req); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return spdk_nvmf_ctrlr_set_features_volatile_write_cache(req); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return spdk_nvmf_ctrlr_set_features_number_of_queues(req); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return spdk_nvmf_ctrlr_set_features_write_atomicity(req); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return spdk_nvmf_ctrlr_set_features_async_event_configuration(req); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return spdk_nvmf_ctrlr_set_features_keep_alive_timer(req); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return spdk_nvmf_ctrlr_set_features_host_identifier(req); + default: + SPDK_ERRLOG("Set Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; 
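The Abort path above pulls SQID from the low 16 bits of CDW10 and CID from the high 16 bits, then walks every poll group looking for the owning queue pair; CDW0 bit 0 of the completion follows the NVMe convention, so the handler pre-loads 1 ("command not aborted") and only clears it when a matching request is found and completed with SPDK_NVME_SC_ABORTED_BY_REQUEST. For illustration, a hypothetical host-side encoding that mirrors this decode (not part of the patch):

#include <stdint.h>
#include <string.h>

#include "spdk/nvme_spec.h"

/* Build an Abort admin command for the given submission queue and command ID,
 * matching the decode above: sqid = cdw10 & 0xFFFF, cid = cdw10 >> 16. */
static void
build_abort_cmd(struct spdk_nvme_cmd *cmd, uint16_t sqid, uint16_t cid)
{
	memset(cmd, 0, sizeof(*cmd));
	cmd->opc = SPDK_NVME_OPC_ABORT;
	cmd->cdw10 = ((uint32_t)cid << 16) | sqid;
}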
+ } +} + +static int +spdk_nvmf_ctrlr_keep_alive(struct spdk_nvmf_request *req) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Keep Alive\n"); + /* + * To handle keep alive just clear or reset the + * ctrlr based keep alive duration counter. + * When added, a separate timer based process + * will monitor if the time since last recorded + * keep alive has exceeded the max duration and + * take appropriate action. + */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + if (ctrlr == NULL) { + SPDK_ERRLOG("Admin command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->vcprop.cc.bits.en != 1) { + SPDK_ERRLOG("Admin command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data && spdk_nvme_opc_get_data_transfer(cmd->opc) == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + memset(req->data, 0, req->length); + } + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* Discovery controllers only support Get Log Page and Identify */ + switch (cmd->opc) { + case SPDK_NVME_OPC_IDENTIFY: + case SPDK_NVME_OPC_GET_LOG_PAGE: + break; + default: + goto invalid_opcode; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_GET_LOG_PAGE: + return spdk_nvmf_ctrlr_get_log_page(req); + case SPDK_NVME_OPC_IDENTIFY: + return spdk_nvmf_ctrlr_identify(req); + case SPDK_NVME_OPC_ABORT: + return spdk_nvmf_ctrlr_abort(req); + case SPDK_NVME_OPC_GET_FEATURES: + return spdk_nvmf_ctrlr_get_features(req); + case SPDK_NVME_OPC_SET_FEATURES: + return spdk_nvmf_ctrlr_set_features(req); + case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST: + return spdk_nvmf_ctrlr_async_event_request(req); + case SPDK_NVME_OPC_KEEP_ALIVE: + return spdk_nvmf_ctrlr_keep_alive(req); + + case SPDK_NVME_OPC_CREATE_IO_SQ: + case SPDK_NVME_OPC_CREATE_IO_CQ: + case SPDK_NVME_OPC_DELETE_IO_SQ: + case SPDK_NVME_OPC_DELETE_IO_CQ: + /* Create and Delete I/O CQ/SQ not allowed in NVMe-oF */ + goto invalid_opcode; + + default: + goto invalid_opcode; + } + +invalid_opcode: + SPDK_ERRLOG("Unsupported admin opcode 0x%x\n", cmd->opc); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_capsule_cmd *cap_hdr; + + cap_hdr = &req->cmd->nvmf_cmd; + + if (qpair->ctrlr == NULL) { + /* No ctrlr established yet; the only valid command is Connect */ + if (cap_hdr->fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT) { + return spdk_nvmf_ctrlr_connect(req); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Got fctype 0x%x, expected Connect\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (spdk_nvmf_qpair_is_admin_queue(qpair)) { + /* + * Controller session is established, and this is an admin queue. + * Disallow Connect and allow other fabrics commands. 
+ */ + switch (cap_hdr->fctype) { + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET: + return spdk_nvmf_property_set(req); + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET: + return spdk_nvmf_property_get(req); + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "unknown fctype 0x%02x\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else { + /* Controller session is established, and this is an I/O queue */ + /* For now, no I/O-specific Fabrics commands are implemented (other than Connect) */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Unexpected I/O fctype 0x%x\n", cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +int +spdk_nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_request *req; + struct spdk_nvme_cpl *rsp; + union spdk_nvme_async_event_completion event = {0}; + + /* Users may disable the event notification */ + if (!ctrlr->feat.async_event_configuration.bits.ns_attr_notice) { + return 0; + } + + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED; + event.bits.log_page_identifier = SPDK_NVME_LOG_CHANGED_NS_LIST; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. + */ + if (!ctrlr->aer_req) { + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + return 0; + } + + ctrlr->notice_event.raw = event.raw; + return 0; + } + + req = ctrlr->aer_req; + rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = event.raw; + + spdk_nvmf_request_complete(req); + ctrlr->aer_req = NULL; + + return 0; +} + +void +spdk_nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (!spdk_nvmf_qpair_is_admin_queue(qpair)) { + return; + } + + if (ctrlr->aer_req != NULL) { + spdk_nvmf_request_free(ctrlr->aer_req); + ctrlr->aer_req = NULL; + } +} + +void +spdk_nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr->aer_req) { + return; + } + + spdk_nvmf_request_complete(ctrlr->aer_req); + ctrlr->aer_req = NULL; +} diff --git a/src/spdk/lib/nvmf/ctrlr_bdev.c b/src/spdk/lib/nvmf/ctrlr_bdev.c new file mode 100644 index 00000000..7eb4f19a --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_bdev.c @@ -0,0 +1,531 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" + +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static bool +spdk_nvmf_subsystem_bdev_io_type_supported(struct spdk_nvmf_subsystem *subsystem, + enum spdk_bdev_io_type io_type) +{ + struct spdk_nvmf_ns *ns; + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->bdev == NULL) { + continue; + } + + if (!spdk_bdev_io_type_supported(ns->bdev, io_type)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "Subsystem %s namespace %u (%s) does not support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + ns->opts.nsid, spdk_bdev_get_name(ns->bdev), (int)io_type); + return false; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "All devices in Subsystem %s support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), (int)io_type); + return true; +} + +bool +spdk_nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return spdk_nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_UNMAP); +} + +bool +spdk_nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return spdk_nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_WRITE_ZEROES); +} + +static void +nvmf_bdev_ctrlr_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + response->status.sc = sc; + response->status.sct = sct; + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +void +spdk_nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_bdev *bdev = ns->bdev; + uint64_t num_blocks; + + num_blocks = spdk_bdev_get_num_blocks(bdev); + + nsdata->nsze = num_blocks; + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(bdev); + nsdata->nmic.can_share = 1; + + SPDK_STATIC_ASSERT(sizeof(nsdata->nguid) == sizeof(ns->opts.nguid), "size mismatch"); + memcpy(nsdata->nguid, ns->opts.nguid, sizeof(nsdata->nguid)); + + SPDK_STATIC_ASSERT(sizeof(nsdata->eui64) == sizeof(ns->opts.eui64), "size mismatch"); + memcpy(&nsdata->eui64, ns->opts.eui64, sizeof(nsdata->eui64)); +} + +static void 
+nvmf_bdev_ctrlr_get_rw_params(const struct spdk_nvme_cmd *cmd, uint64_t *start_lba, + uint64_t *num_blocks) +{ + /* SLBA: CDW10 and CDW11 */ + *start_lba = from_le64(&cmd->cdw10); + + /* NLB: CDW12 bits 15:00, 0's based */ + *num_blocks = (from_le32(&cmd->cdw12) & 0xFFFFu) + 1; +} + +static bool +nvmf_bdev_ctrlr_lba_in_range(uint64_t bdev_num_blocks, uint64_t io_start_lba, + uint64_t io_num_blocks) +{ + if (io_start_lba + io_num_blocks > bdev_num_blocks || + io_start_lba + io_num_blocks < io_start_lba) { + return false; + } + + return true; +} + +static void +spdk_nvmf_ctrlr_process_io_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + spdk_nvmf_ctrlr_process_io_cmd(req); +} + +static void +nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + int rc; + + req->bdev_io_wait.bdev = bdev; + req->bdev_io_wait.cb_fn = cb_fn; + req->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &req->bdev_io_wait); + if (rc != 0) { + assert(false); + } +} + +static int +nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Read NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * 
block_size > req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_write_zeroes_blocks(desc, ch, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + /* For an NVMe-oF controller, SPDK always sets the volatile write + * cache bit to 1, so return success for block devices + * that cannot support the FLUSH command.
+ */ + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev), + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +struct nvmf_virtual_ctrlr_unmap { + struct spdk_nvmf_request *req; + uint32_t count; + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + struct spdk_io_channel *ch; +}; + +static void +nvmf_virtual_ctrlr_dsm_cpl(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct nvmf_virtual_ctrlr_unmap *unmap_ctx = cb_arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + + unmap_ctx->count--; + + if (response->status.sct == SPDK_NVME_SCT_GENERIC && + response->status.sc == SPDK_NVME_SC_SUCCESS) { + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + response->status.sc = sc; + response->status.sct = sct; + } + + if (unmap_ctx->count == 0) { + spdk_nvmf_request_complete(req); + free(unmap_ctx); + } + spdk_bdev_free_io(bdev_io); +} + +static int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_virtual_ctrlr_unmap *unmap_ctx); +static void +nvmf_bdev_ctrlr_dsm_cmd_resubmit(void *arg) +{ + struct nvmf_virtual_ctrlr_unmap *unmap_ctx = arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_bdev_desc *desc = unmap_ctx->desc; + struct spdk_bdev *bdev = unmap_ctx->bdev; + struct spdk_io_channel *ch = unmap_ctx->ch; + + nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req, unmap_ctx); +} + +static int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_virtual_ctrlr_unmap *unmap_ctx) +{ + uint32_t attribute; + uint16_t nr, i; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + nr = ((cmd->cdw10 & 0x000000ff) + 1); + if (nr * sizeof(struct spdk_nvme_dsm_range) > req->length) { + SPDK_ERRLOG("Dataset Management number of ranges > SGL length\n"); + response->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + attribute = cmd->cdw11 & 0x00000007; + if (attribute & SPDK_NVME_DSM_ATTR_DEALLOCATE) { + struct spdk_nvme_dsm_range *dsm_range; + uint64_t lba; + uint32_t lba_count; + + if (unmap_ctx == NULL) { + unmap_ctx = calloc(1, sizeof(*unmap_ctx)); + if (!unmap_ctx) { + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + unmap_ctx->req = req; + unmap_ctx->desc = desc; + unmap_ctx->ch = ch; + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + + dsm_range = (struct spdk_nvme_dsm_range *)req->data; + for (i = unmap_ctx->count; i < nr; i++) { + lba = dsm_range[i].starting_lba; + lba_count = dsm_range[i].length; + + unmap_ctx->count++; + + rc = spdk_bdev_unmap_blocks(desc, ch, lba, lba_count, + 
nvmf_virtual_ctrlr_dsm_cpl, unmap_ctx); + if (rc) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_bdev_ctrlr_dsm_cmd_resubmit, unmap_ctx); + /* Unmap was not yet submitted to bdev */ + unmap_ctx->count--; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + unmap_ctx->count--; + /* We can't return here - we may have to wait for any other + * unmaps already sent to complete */ + break; + } + } + + if (unmap_ctx->count == 0) { + free(unmap_ctx); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + int rc; + + rc = spdk_bdev_nvme_io_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +spdk_nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) +{ + uint32_t nsid; + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + /* pre-set response details for this command */ + response->status.sc = SPDK_NVME_SC_SUCCESS; + nsid = cmd->nsid; + + if (spdk_unlikely(ctrlr == NULL)) { + SPDK_ERRLOG("I/O command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(ctrlr->vcprop.cc.bits.en != 1)) { + SPDK_ERRLOG("I/O command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _spdk_nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid); + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + response->status.dnr = 1; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + bdev = ns->bdev; + desc = ns->desc; + ch = group->sgroups[ctrlr->subsys->id].channels[nsid - 1]; + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + return nvmf_bdev_ctrlr_read_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE: + return nvmf_bdev_ctrlr_write_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE_ZEROES: + return nvmf_bdev_ctrlr_write_zeroes_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_FLUSH: + return nvmf_bdev_ctrlr_flush_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + return nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req, NULL); + default: + return nvmf_bdev_ctrlr_nvme_passthru_io(bdev, desc, ch, 
req); + } +} diff --git a/src/spdk/lib/nvmf/ctrlr_discovery.c b/src/spdk/lib/nvmf/ctrlr_discovery.c new file mode 100644 index 00000000..305a6076 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_discovery.c @@ -0,0 +1,144 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe over Fabrics discovery service + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/event.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static void +nvmf_update_discovery_log(struct spdk_nvmf_tgt *tgt) +{ + uint64_t numrec = 0; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_listener *listener; + struct spdk_nvmf_discovery_log_page_entry *entry; + struct spdk_nvmf_discovery_log_page *disc_log; + size_t cur_size; + uint32_t sid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Generating log page for genctr %" PRIu64 "\n", + tgt->discovery_genctr); + + cur_size = sizeof(struct spdk_nvmf_discovery_log_page); + disc_log = calloc(1, cur_size); + if (disc_log == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + return; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + continue; + } + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + size_t new_size = cur_size + sizeof(*entry); + void *new_log_page = realloc(disc_log, new_size); + + if (new_log_page == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + break; + } + + disc_log = new_log_page; + cur_size = new_size; + + entry = &disc_log->entries[numrec]; + memset(entry, 0, sizeof(*entry)); + entry->portid = numrec; + entry->cntlid = 0xffff; + entry->asqsz = listener->transport->opts.max_aq_depth; + entry->subtype = subsystem->subtype; + snprintf(entry->subnqn, sizeof(entry->subnqn), "%s", subsystem->subnqn); + + spdk_nvmf_transport_listener_discover(listener->transport, &listener->trid, entry); + + numrec++; + } + } + + disc_log->numrec = numrec; + disc_log->genctr = tgt->discovery_genctr; + + free(tgt->discovery_log_page); + + tgt->discovery_log_page = disc_log; + tgt->discovery_log_page_size = cur_size; +} + +void +spdk_nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, void *buffer, + uint64_t offset, uint32_t length) +{ + size_t copy_len = 0; + size_t zero_len = length; + + if (tgt->discovery_log_page == NULL || + tgt->discovery_log_page->genctr != tgt->discovery_genctr) { + nvmf_update_discovery_log(tgt); + } + + /* Copy the valid part of the discovery log page, if any */ + if (tgt->discovery_log_page && offset < tgt->discovery_log_page_size) { + copy_len = spdk_min(tgt->discovery_log_page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)tgt->discovery_log_page + offset, copy_len); + } + + /* Zero out the rest of the buffer */ + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } + + /* We should have copied or zeroed every byte of the output buffer. */ + assert(copy_len + zero_len == length); +} diff --git a/src/spdk/lib/nvmf/nvmf.c b/src/spdk/lib/nvmf/nvmf.c new file mode 100644 index 00000000..32539f53 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf.c @@ -0,0 +1,1173 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bit_array.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/trace.h" +#include "spdk/endian.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "nvmf_internal.h" +#include "transport.h" + +SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF) + +#define SPDK_NVMF_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 +#define SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024 +#define SPDK_NVMF_DEFAULT_IO_UNIT_SIZE 131072 + +typedef void (*nvmf_qpair_disconnect_cpl)(void *ctx, int status); +static void spdk_nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf); + +/* supplied to a single call to nvmf_qpair_disconnect */ +struct nvmf_qpair_disconnect_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_ctrlr *ctrlr; + nvmf_qpair_disconnect_cb cb_fn; + struct spdk_thread *thread; + void *ctx; + uint16_t qid; +}; + +/* + * There are several times when we need to iterate through the list of all qpairs and selectively delete them. + * In order to do this sequentially without overlap, we must provide a context to recover the next qpair from + * to enable calling nvmf_qpair_disconnect on the next desired qpair. 
+ */ +struct nvmf_qpair_disconnect_many_ctx { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + spdk_nvmf_poll_group_mod_done cpl_fn; + void *cpl_ctx; +}; + +static void +spdk_nvmf_qpair_set_state(struct spdk_nvmf_qpair *qpair, + enum spdk_nvmf_qpair_state state) +{ + assert(qpair != NULL); + assert(qpair->group->thread == spdk_get_thread()); + + qpair->state = state; +} + +void +spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_DEFAULT_MAX_IO_SIZE; + opts->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS; + opts->io_unit_size = SPDK_NVMF_DEFAULT_IO_UNIT_SIZE; +} + +static int +spdk_nvmf_poll_group_poll(void *ctx) +{ + struct spdk_nvmf_poll_group *group = ctx; + int rc; + int count = 0; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + rc = spdk_nvmf_transport_poll_group_poll(tgroup); + if (rc < 0) { + return -1; + } + count += rc; + } + + return count; +} + +static int +spdk_nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport *transport; + uint32_t sid; + + TAILQ_INIT(&group->tgroups); + TAILQ_INIT(&group->qpairs); + + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_nvmf_poll_group_add_transport(group, transport); + } + + group->num_sgroups = tgt->opts.max_subsystems; + group->sgroups = calloc(tgt->opts.max_subsystems, sizeof(struct spdk_nvmf_subsystem_poll_group)); + if (!group->sgroups) { + return -1; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = tgt->subsystems[sid]; + if (!subsystem) { + continue; + } + + if (spdk_nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) { + spdk_nvmf_tgt_destroy_poll_group(io_device, ctx_buf); + return -1; + } + } + + group->poller = spdk_poller_register(spdk_nvmf_poll_group_poll, group, 0); + group->thread = spdk_get_thread(); + + return 0; +} + +static void +spdk_nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport_poll_group *tgroup, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t sid, nsid; + + TAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp) { + TAILQ_REMOVE(&group->tgroups, tgroup, link); + spdk_nvmf_transport_poll_group_destroy(tgroup); + } + + for (sid = 0; sid < group->num_sgroups; sid++) { + sgroup = &group->sgroups[sid]; + + for (nsid = 0; nsid < sgroup->num_channels; nsid++) { + if (sgroup->channels[nsid]) { + spdk_put_io_channel(sgroup->channels[nsid]); + sgroup->channels[nsid] = NULL; + } + } + + free(sgroup->channels); + } + + free(group->sgroups); +} + +static void +_nvmf_tgt_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_poll_group *group = qpair_ctx->group; + struct spdk_io_channel *ch; + int rc = 0; + + qpair = TAILQ_FIRST(&group->qpairs); + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx); + } + + if (!qpair || rc != 0) { + /* When the refcount from the channels reaches 0, spdk_nvmf_tgt_destroy_poll_group will be called. 
*/ + ch = spdk_io_channel_from_ctx(group); + spdk_put_io_channel(ch); + free(qpair_ctx); + } +} + +static void +spdk_nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group) +{ + struct nvmf_qpair_disconnect_many_ctx *ctx; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Failed to allocate memory for destroy poll group ctx\n"); + return; + } + + spdk_poller_unregister(&group->poller); + + ctx->group = group; + _nvmf_tgt_disconnect_next_qpair(ctx); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts) +{ + struct spdk_nvmf_tgt *tgt; + + tgt = calloc(1, sizeof(*tgt)); + if (!tgt) { + return NULL; + } + + if (!opts) { + spdk_nvmf_tgt_opts_init(&tgt->opts); + } else { + tgt->opts = *opts; + } + + tgt->discovery_genctr = 0; + tgt->discovery_log_page = NULL; + tgt->discovery_log_page_size = 0; + TAILQ_INIT(&tgt->transports); + + tgt->subsystems = calloc(tgt->opts.max_subsystems, sizeof(struct spdk_nvmf_subsystem *)); + if (!tgt->subsystems) { + free(tgt); + return NULL; + } + + spdk_io_device_register(tgt, + spdk_nvmf_tgt_create_poll_group, + spdk_nvmf_tgt_destroy_poll_group, + sizeof(struct spdk_nvmf_poll_group), + "nvmf_tgt"); + + return tgt; +} + +static void +spdk_nvmf_tgt_destroy_cb(void *io_device) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_transport *transport, *transport_tmp; + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + uint32_t i; + + if (tgt->discovery_log_page) { + free(tgt->discovery_log_page); + } + + if (tgt->subsystems) { + for (i = 0; i < tgt->opts.max_subsystems; i++) { + if (tgt->subsystems[i]) { + spdk_nvmf_subsystem_destroy(tgt->subsystems[i]); + } + } + free(tgt->subsystems); + } + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, transport_tmp) { + TAILQ_REMOVE(&tgt->transports, transport, link); + spdk_nvmf_transport_destroy(transport); + } + + destroy_cb_fn = tgt->destroy_cb_fn; + destroy_cb_arg = tgt->destroy_cb_arg; + + free(tgt); + + if (destroy_cb_fn) { + destroy_cb_fn(destroy_cb_arg, 0); + } +} + +void +spdk_nvmf_tgt_destroy(struct spdk_nvmf_tgt *tgt, + spdk_nvmf_tgt_destroy_done_fn cb_fn, + void *cb_arg) +{ + tgt->destroy_cb_fn = cb_fn; + tgt->destroy_cb_arg = cb_arg; + + spdk_io_device_unregister(tgt, spdk_nvmf_tgt_destroy_cb); +} + +static void +spdk_nvmf_write_subsystem_config_json(struct spdk_json_write_ctx *w, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_listener *listener; + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + char uuid_str[SPDK_UUID_STRING_LEN]; + const char *trtype; + const char *adrfam; + + if (spdk_nvmf_subsystem_get_type(subsystem) != SPDK_NVMF_SUBTYPE_NVME) { + return; + } + + /* { */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_create"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_bool(w, "allow_any_host", spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } 
*/ + spdk_json_write_object_end(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + trid = spdk_nvmf_listener_get_trid(listener); + + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_listener"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "listen_address" : { */ + spdk_json_write_named_object_begin(w, "listen_address"); + + spdk_json_write_named_string(w, "trtype", trtype); + if (adrfam) { + spdk_json_write_named_string(w, "adrfam", adrfam); + } + + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + /* } "listen_address" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_host"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_string(w, "host", spdk_nvmf_host_get_nqn(host)); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_ns"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "namespace" : { */ + spdk_json_write_named_object_begin(w, "namespace"); + + spdk_json_write_named_uint32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(uint64_t) * 2, "size mismatch"); + spdk_json_write_named_string_fmt(w, "nguid", "%016"PRIX64"%016"PRIX64, from_be64(&ns_opts.nguid[0]), + from_be64(&ns_opts.nguid[8])); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(uint64_t), "size mismatch"); + spdk_json_write_named_string_fmt(w, "eui64", "%016"PRIX64, from_be64(&ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + /* "namespace" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } +} + +void +spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_transport *transport; + + spdk_json_write_object_begin(w); + 
spdk_json_write_named_string(w, "method", "set_nvmf_target_options"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "max_queue_depth", tgt->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_qpairs_per_ctrlr", tgt->opts.max_qpairs_per_ctrlr); + spdk_json_write_named_uint32(w, "in_capsule_data_size", tgt->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", tgt->opts.max_io_size); + spdk_json_write_named_uint32(w, "max_subsystems", tgt->opts.max_subsystems); + spdk_json_write_named_uint32(w, "io_unit_size", tgt->opts.io_unit_size); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + /* write transports */ + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_transport"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "trtype", spdk_nvme_transport_id_trtype_str(transport->ops->type)); + spdk_json_write_named_uint32(w, "max_queue_depth", transport->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_qpairs_per_ctrlr", transport->opts.max_qpairs_per_ctrlr); + spdk_json_write_named_uint32(w, "in_capsule_data_size", transport->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", transport->opts.max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", transport->opts.io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", transport->opts.max_aq_depth); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + spdk_nvmf_write_subsystem_config_json(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } +} + +void +spdk_nvmf_tgt_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_listen_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_transport *transport; + int rc; + bool propagate = false; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trtype); + if (!transport) { + struct spdk_nvmf_transport_opts opts; + + opts.max_queue_depth = tgt->opts.max_queue_depth; + opts.max_qpairs_per_ctrlr = tgt->opts.max_qpairs_per_ctrlr; + opts.in_capsule_data_size = tgt->opts.in_capsule_data_size; + opts.max_io_size = tgt->opts.max_io_size; + opts.io_unit_size = tgt->opts.io_unit_size; + /* use max_queue_depth since tgt opts
doesn't have max_aq_depth */ + opts.max_aq_depth = tgt->opts.max_queue_depth; + + transport = spdk_nvmf_transport_create(trid->trtype, &opts); + if (!transport) { + SPDK_ERRLOG("Transport initialization failed\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + propagate = true; + } + + rc = spdk_nvmf_transport_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Unable to listen on address '%s'\n", trid->traddr); + cb_fn(cb_arg, rc); + return; + } + + tgt->discovery_genctr++; + + if (propagate) { + spdk_nvmf_tgt_add_transport(tgt, transport, cb_fn, cb_arg); + } else { + cb_fn(cb_arg, 0); + } +} + +struct spdk_nvmf_tgt_add_transport_ctx { + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_transport *transport; + spdk_nvmf_tgt_add_transport_done_fn cb_fn; + void *cb_arg; +}; + +static void +_spdk_nvmf_tgt_add_transport_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status); + + free(ctx); +} + +static void +_spdk_nvmf_tgt_add_transport(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + int rc; + + rc = spdk_nvmf_poll_group_add_transport(group, ctx->transport); + spdk_for_each_channel_continue(i, rc); +} + +void spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + spdk_nvmf_tgt_add_transport_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx; + + if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->type)) { + cb_fn(cb_arg, -EEXIST); + return; /* transport already created */ + } + + transport->tgt = tgt; + TAILQ_INSERT_TAIL(&tgt->transports, transport, link); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->tgt = tgt; + ctx->transport = transport; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(tgt, + _spdk_nvmf_tgt_add_transport, + ctx, + _spdk_nvmf_tgt_add_transport_done); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_tgt_find_subsystem(struct spdk_nvmf_tgt *tgt, const char *subnqn) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (!subnqn) { + return NULL; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (strcmp(subnqn, subsystem->subnqn) == 0) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, enum spdk_nvme_transport_type type) +{ + struct spdk_nvmf_transport *transport; + + TAILQ_FOREACH(transport, &tgt->transports, link) { + if (transport->ops->type == type) { + return transport; + } + } + + return NULL; +} + +void +spdk_nvmf_tgt_accept(struct spdk_nvmf_tgt *tgt, new_qpair_fn cb_fn) +{ + struct spdk_nvmf_transport *transport, *tmp; + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, tmp) { + spdk_nvmf_transport_accept(transport, cb_fn); + } +} + +struct spdk_nvmf_poll_group * +spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_io_channel *ch; + + ch = spdk_get_io_channel(tgt); + if (!ch) { + SPDK_ERRLOG("Unable to get I/O channel for target\n"); + return NULL; + } + + return spdk_io_channel_get_ctx(ch); +} + +void +spdk_nvmf_poll_group_destroy(struct spdk_nvmf_poll_group *group) +{ + /* This function will 
put the io_channel associated with this poll group */ + spdk_nvmf_tgt_destroy_poll_group_qpairs(group); +} + +int +spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = -1; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_INIT(&qpair->outstanding); + qpair->group = group; + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVATING); + + TAILQ_INSERT_TAIL(&group->qpairs, qpair, link); + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = spdk_nvmf_transport_poll_group_add(tgroup, qpair); + break; + } + } + + if (rc == 0) { + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE); + } else { + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_INACTIVE); + } + + return rc; +} + +static +void _nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + spdk_nvmf_ctrlr_destruct(ctrlr); +} + +static void +_spdk_nvmf_ctrlr_free_from_qpair(void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_ctrlr *ctrlr = qpair_ctx->ctrlr; + uint32_t count; + + spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid); + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + if (count == 0) { + spdk_bit_array_free(&ctrlr->qpair_mask); + + spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr); + } + + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); +} + +static void +_spdk_nvmf_qpair_destroy(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_qpair *qpair = qpair_ctx->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + assert(qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING); + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_INACTIVE); + qpair_ctx->qid = qpair->qid; + + TAILQ_REMOVE(&qpair->group->qpairs, qpair, link); + qpair->group = NULL; + + spdk_nvmf_transport_qpair_fini(qpair); + + if (!ctrlr || !ctrlr->thread) { + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); + return; + } + + qpair_ctx->ctrlr = ctrlr; + spdk_thread_send_msg(ctrlr->thread, _spdk_nvmf_ctrlr_free_from_qpair, qpair_ctx); + +} + +int +spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx; + + /* If we get a qpair in the uninitialized state, we can just destroy it immediately */ + if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + spdk_nvmf_transport_qpair_fini(qpair); + if (cb_fn) { + cb_fn(ctx); + } + return 0; + } + + /* The queue pair must be disconnected from the thread that owns it */ + assert(qpair->group->thread == spdk_get_thread()); + + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING || + qpair->state == SPDK_NVMF_QPAIR_INACTIVE) { + /* This can occur if the connection is killed by the target, + * which results in a notification that the connection + * died. Send a message to defer the processing of this + * callback. This allows the stack to unwind in the case + * where a bunch of connections are disconnected in + * a loop. 
*/ + if (cb_fn) { + spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx); + } + return 0; + } + + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING); + + qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx)); + if (!qpair_ctx) { + SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n"); + return -ENOMEM; + } + + qpair_ctx->qpair = qpair; + qpair_ctx->cb_fn = cb_fn; + qpair_ctx->thread = qpair->group->thread; + qpair_ctx->ctx = ctx; + + /* Check for outstanding I/O */ + if (!TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb = _spdk_nvmf_qpair_destroy; + qpair->state_cb_arg = qpair_ctx; + spdk_nvmf_qpair_free_aer(qpair); + return 0; + } + + _spdk_nvmf_qpair_destroy(qpair_ctx, 0); + + return 0; +} + +int +spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_listen_trid(qpair, trid); +} + +int +spdk_nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == transport) { + /* Transport already in the poll group */ + return 0; + } + } + + tgroup = spdk_nvmf_transport_poll_group_create(transport); + if (!tgroup) { + SPDK_ERRLOG("Unable to create poll group for transport\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + + return 0; +} + +static int +poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t new_num_channels, old_num_channels; + uint32_t i; + struct spdk_nvmf_ns *ns; + + /* Make sure our poll group has memory for this subsystem allocated */ + if (subsystem->id >= group->num_sgroups) { + return -ENOMEM; + } + + sgroup = &group->sgroups[subsystem->id]; + + /* Make sure the array of channels is the correct size */ + new_num_channels = subsystem->max_nsid; + old_num_channels = sgroup->num_channels; + + if (old_num_channels == 0) { + if (new_num_channels > 0) { + /* First allocation */ + sgroup->channels = calloc(new_num_channels, sizeof(sgroup->channels[0])); + if (!sgroup->channels) { + return -ENOMEM; + } + } + } else if (new_num_channels > old_num_channels) { + void *buf; + + /* Make the array larger */ + buf = realloc(sgroup->channels, new_num_channels * sizeof(sgroup->channels[0])); + if (!buf) { + return -ENOMEM; + } + + sgroup->channels = buf; + + /* Null out the new channels slots */ + for (i = old_num_channels; i < new_num_channels; i++) { + sgroup->channels[i] = NULL; + } + } else if (new_num_channels < old_num_channels) { + void *buf; + + /* Free the extra I/O channels */ + for (i = new_num_channels; i < old_num_channels; i++) { + if (sgroup->channels[i]) { + spdk_put_io_channel(sgroup->channels[i]); + sgroup->channels[i] = NULL; + } + } + + /* Make the array smaller */ + if (new_num_channels > 0) { + buf = realloc(sgroup->channels, new_num_channels * sizeof(sgroup->channels[0])); + if (!buf) { + return -ENOMEM; + } + sgroup->channels = 
buf; + } else { + free(sgroup->channels); + sgroup->channels = NULL; + } + } + + sgroup->num_channels = new_num_channels; + + /* Detect bdevs that were added or removed */ + for (i = 0; i < sgroup->num_channels; i++) { + ns = subsystem->ns[i]; + if (ns == NULL && sgroup->channels[i] == NULL) { + /* Both NULL. Leave empty */ + } else if (ns == NULL && sgroup->channels[i] != NULL) { + /* There was a channel here, but the namespace is gone. */ + spdk_put_io_channel(sgroup->channels[i]); + sgroup->channels[i] = NULL; + } else if (ns != NULL && sgroup->channels[i] == NULL) { + /* A namespace appeared but there is no channel yet */ + sgroup->channels[i] = spdk_bdev_get_io_channel(ns->desc); + if (sgroup->channels[i] == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + } else { + /* A namespace was present before and didn't change. */ + } + } + + return 0; +} + +int +spdk_nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + return poll_group_update_subsystem(group, subsystem); +} + +int +spdk_nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + int rc = 0; + struct spdk_nvmf_subsystem_poll_group *sgroup = &group->sgroups[subsystem->id]; + + TAILQ_INIT(&sgroup->queued); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + spdk_nvmf_poll_group_remove_subsystem(group, subsystem, NULL, NULL); + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } + + return rc; +} + +static void +_nvmf_poll_group_remove_subsystem_cb(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem_poll_group *sgroup; + spdk_nvmf_poll_group_mod_done cpl_fn = NULL; + void *cpl_ctx = NULL; + uint32_t nsid; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + cpl_fn = qpair_ctx->cpl_fn; + cpl_ctx = qpair_ctx->cpl_ctx; + sgroup = &group->sgroups[subsystem->id]; + + if (status) { + goto fini; + } + + for (nsid = 0; nsid < sgroup->num_channels; nsid++) { + if (sgroup->channels[nsid]) { + spdk_put_io_channel(sgroup->channels[nsid]); + sgroup->channels[nsid] = NULL; + } + } + + sgroup->num_channels = 0; + free(sgroup->channels); + sgroup->channels = NULL; +fini: + free(qpair_ctx); + if (cpl_fn) { + cpl_fn(cpl_ctx, status); + } +} + +static void +_nvmf_subsystem_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + int rc = 0; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr->subsys == subsystem) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, qpair_ctx); + } + + if (!qpair || rc != 0) { + _nvmf_poll_group_remove_subsystem_cb(ctx, rc); + } + return; +} + +void +spdk_nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + struct nvmf_qpair_disconnect_many_ctx *ctx; + int rc = 0; + + ctx = calloc(1, sizeof(struct 
nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Unable to allocate memory for context to remove poll subsystem\n"); + goto fini; + } + + ctx->group = group; + ctx->subsystem = subsystem; + ctx->cpl_fn = cb_fn; + ctx->cpl_ctx = cb_arg; + + sgroup = &group->sgroups[subsystem->id]; + sgroup->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr->subsys == subsystem) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, ctx); + } else { + /* call the callback immediately. It will handle any channel iteration */ + _nvmf_poll_group_remove_subsystem_cb(ctx, 0); + } + + if (rc != 0) { + free(ctx); + goto fini; + } + + return; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +spdk_nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + if (sgroup == NULL) { + rc = -1; + goto fini; + } + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE); + /* TODO: This currently does not quiesce I/O */ + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +spdk_nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + + /* Release all queued requests */ + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + TAILQ_REMOVE(&sgroup->queued, req, link); + spdk_nvmf_request_exec(req); + } +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} diff --git a/src/spdk/lib/nvmf/nvmf_fc.h b/src/spdk/lib/nvmf/nvmf_fc.h new file mode 100644 index 00000000..bf086831 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_fc.h @@ -0,0 +1,871 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVMF_FC_H__ +#define __NVMF_FC_H__ + +#include "spdk/nvmf.h" +#include "spdk/assert.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_fc_spec.h" +#include "spdk/event.h" +#include "spdk/io_channel.h" +#include "nvmf_internal.h" + +#define SPDK_NVMF_FC_TR_ADDR_LEN 64 + +/* + * FC HW port states. + */ +enum spdk_fc_port_state { + SPDK_FC_PORT_OFFLINE = 0, + SPDK_FC_PORT_ONLINE = 1, + SPDK_FC_PORT_QUIESCED = 2, +}; + +enum spdk_fc_hwqp_state { + SPDK_FC_HWQP_OFFLINE = 0, + SPDK_FC_HWQP_ONLINE = 1, +}; + +/* + * NVMF BCM FC Object state + * Add all the generic states of the object here. + * Specific object states can be added separately + */ +enum spdk_nvmf_fc_object_state { + SPDK_NVMF_FC_OBJECT_CREATED = 0, + SPDK_NVMF_FC_OBJECT_TO_BE_DELETED = 1, + SPDK_NVMF_FC_OBJECT_ZOMBIE = 2, /* Partial Create or Delete */ +}; + +/* + * FC request state + */ +enum spdk_nvmf_fc_request_state { + SPDK_NVMF_FC_REQ_INIT = 0, + SPDK_NVMF_FC_REQ_READ_BDEV, + SPDK_NVMF_FC_REQ_READ_XFER, + SPDK_NVMF_FC_REQ_READ_RSP, + SPDK_NVMF_FC_REQ_WRITE_BUFFS, + SPDK_NVMF_FC_REQ_WRITE_XFER, + SPDK_NVMF_FC_REQ_WRITE_BDEV, + SPDK_NVMF_FC_REQ_WRITE_RSP, + SPDK_NVMF_FC_REQ_NONE_BDEV, + SPDK_NVMF_FC_REQ_NONE_RSP, + SPDK_NVMF_FC_REQ_SUCCESS, + SPDK_NVMF_FC_REQ_FAILED, + SPDK_NVMF_FC_REQ_ABORTED, + SPDK_NVMF_FC_REQ_PENDING, + SPDK_NVMF_FC_REQ_MAX_STATE, +}; + +/* + * FC HWQP pointer + */ +typedef void *spdk_nvmf_fc_lld_hwqp_t; + +/* + * FC World Wide Name + */ +struct spdk_nvmf_fc_wwn { + union { + uint64_t wwn; /* World Wide Names consist of eight bytes */ + uint8_t octets[sizeof(uint64_t)]; + } u; +}; + +/* + * Generic DMA buffer descriptor + */ +struct spdk_nvmf_fc_buffer_desc { + void *virt; + uint64_t phys; + size_t len; + + /* Internal */ + uint32_t buf_index; +}; + +/* + * ABTS hadling context + */ +struct spdk_nvmf_fc_abts_ctx { + bool handled; + uint16_t hwqps_responded; + uint16_t rpi; + uint16_t oxid; + uint16_t rxid; + struct spdk_nvmf_fc_nport *nport; + uint16_t nport_hdl; + uint8_t port_hdl; + void *abts_poller_args; + void *sync_poller_args; + int num_hwqps; + bool queue_synced; + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *ls_hwqp; + uint16_t fcp_rq_id; +}; + +/* + * NVME FC transport errors + */ +struct spdk_nvmf_fc_errors { + uint32_t no_xri; + uint32_t nport_invalid; + uint32_t unknown_frame; + uint32_t wqe_cmplt_err; + uint32_t wqe_write_err; + uint32_t rq_status_err; + uint32_t rq_buf_len_err; + uint32_t rq_id_err; + uint32_t rq_index_err; + uint32_t invalid_cq_type; + uint32_t invalid_cq_id; + uint32_t fc_req_buf_err; + uint32_t aq_buf_alloc_err; + uint32_t write_buf_alloc_err; + uint32_t read_buf_alloc_err; + uint32_t unexpected_err; + uint32_t nvme_cmd_iu_err; + uint32_t nvme_cmd_xfer_err; + uint32_t queue_entry_invalid; + uint32_t invalid_conn_err; + uint32_t fcp_rsp_failure; + uint32_t write_failed; + uint32_t read_failed; + uint32_t rport_invalid; + uint32_t num_aborted; + uint32_t num_abts_sent; +}; + +/* + * Send Single Request/Response Sequence. 
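+ * (for example, the disconnect request an association sends to its initiator via snd_disconn_bufs).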
+ */ +struct spdk_nvmf_fc_send_srsr { + struct spdk_nvmf_fc_buffer_desc rqst; + struct spdk_nvmf_fc_buffer_desc rsp; + struct spdk_nvmf_fc_buffer_desc sgl; /* Note: Len = (2 * bcm_sge_t) */ + uint16_t rpi; +}; + +/* + * Struct representing a nport + */ +struct spdk_nvmf_fc_nport { + + uint16_t nport_hdl; + uint8_t port_hdl; + uint32_t d_id; + enum spdk_nvmf_fc_object_state nport_state; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + + /* list of remote ports (i.e. initiators) connected to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_remote_port_info) rem_port_list; + uint32_t rport_count; + + void *vendor_data; /* available for vendor use */ + + /* list of associations to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_association) fc_associations; + uint32_t assoc_count; + struct spdk_nvmf_fc_port *fc_port; + TAILQ_ENTRY(spdk_nvmf_fc_nport) link; /* list of nports on a hw port. */ +}; + +/* + * NVMF FC Connection + */ +struct spdk_nvmf_fc_conn { + struct spdk_nvmf_qpair qpair; + + uint64_t conn_id; + struct spdk_nvmf_fc_hwqp *hwqp; + uint16_t esrp_ratio; + uint16_t rsp_count; + uint32_t rsn; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + uint16_t max_rw_depth; + /* The current number of I/O outstanding on this connection. This number + * includes all I/O from the time the capsule is first received until it is + * completed. + */ + uint16_t cur_queue_depth; + + /* number of read/write requests that are outstanding */ + uint16_t cur_fc_rw_depth; + + /* requests that are waiting to obtain xri/buffer */ + TAILQ_HEAD(, spdk_nvmf_fc_request) pending_queue; + + struct spdk_nvmf_fc_association *fc_assoc; + + /* additional FC info here - TBD */ + uint16_t rpi; + + /* for association's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_link; + + /* for assocations's available connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_avail_link; + + /* for hwqp's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) link; +}; + +/* + * Structure for maintaining the XRI's + */ +struct spdk_nvmf_fc_xri { + uint32_t xri; /* The actual xri value */ + /* Internal */ + TAILQ_ENTRY(spdk_nvmf_fc_xri) link; + bool is_active; +}; + +struct spdk_nvmf_fc_poll_group; + +/* + * HWQP poller structure passed from Master thread + */ +struct spdk_nvmf_fc_hwqp { + uint32_t lcore_id; /* core hwqp is running on (for tracing purposes only) */ + struct spdk_thread *thread; /* thread hwqp is running on */ + uint32_t hwqp_id; /* A unique id (per physical port) for a hwqp */ + uint32_t rq_size; /* receive queue size */ + spdk_nvmf_fc_lld_hwqp_t queues; /* vendor HW queue set */ + struct spdk_nvmf_fc_port *fc_port; /* HW port structure for these queues */ + struct spdk_nvmf_fc_poll_group *poll_group; + + void *context; /* Vendor Context */ + + TAILQ_HEAD(, spdk_nvmf_fc_conn) connection_list; + uint32_t num_conns; /* number of connections to queue */ + uint16_t cid_cnt; /* used to generate unique conn. id for RQ */ + uint32_t free_q_slots; /* free q slots available for connections */ + enum spdk_fc_hwqp_state state; /* Poller state (e.g. online, offline) */ + + /* Internal */ + struct spdk_mempool *fc_request_pool; + TAILQ_HEAD(, spdk_nvmf_fc_request) in_use_reqs; + + TAILQ_HEAD(, spdk_nvmf_fc_xri) pending_xri_list; + + struct spdk_nvmf_fc_errors counters; + uint32_t send_frame_xri; + uint8_t send_frame_seqid; + + /* Pending LS request waiting for XRI. 
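+ * See spdk_nvmf_fc_process_pending_ls_rqst().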
*/ + TAILQ_HEAD(, spdk_nvmf_fc_ls_rqst) ls_pending_queue; + + /* Sync req list */ + TAILQ_HEAD(, spdk_nvmf_fc_poller_api_queue_sync_args) sync_cbs; + + TAILQ_ENTRY(spdk_nvmf_fc_hwqp) link; +}; + +struct spdk_nvmf_fc_ls_rsrc_pool { + void *assocs_mptr; + uint32_t assocs_count; + TAILQ_HEAD(, spdk_nvmf_fc_association) assoc_free_list; + + void *conns_mptr; + uint32_t conns_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conn_free_list; +}; + +/* + * FC HW port. + */ +struct spdk_nvmf_fc_port { + uint8_t port_hdl; + enum spdk_fc_port_state hw_port_status; + uint32_t xri_base; + uint32_t xri_count; + uint16_t fcp_rq_id; + struct spdk_ring *xri_ring; + struct spdk_nvmf_fc_hwqp ls_queue; + uint32_t num_io_queues; + struct spdk_nvmf_fc_hwqp *io_queues; + /* + * List of nports on this HW port. + */ + TAILQ_HEAD(, spdk_nvmf_fc_nport)nport_list; + int num_nports; + TAILQ_ENTRY(spdk_nvmf_fc_port) link; + + struct spdk_nvmf_fc_ls_rsrc_pool ls_rsrc_pool; + struct spdk_mempool *io_rsrc_pool; /* Pools to store bdev_io's for this port */ + void *port_ctx; +}; + +/* + * NVMF FC Request + */ +struct spdk_nvmf_fc_request { + struct spdk_nvmf_request req; + struct spdk_nvmf_fc_ersp_iu ersp; + uint32_t poller_lcore; /* for tracing purposes only */ + struct spdk_thread *poller_thread; + uint16_t buf_index; + struct spdk_nvmf_fc_xri *xri; + uint16_t oxid; + uint16_t rpi; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + int state; + uint32_t transfered_len; + bool is_aborted; + uint32_t magic; + uint32_t s_id; + uint32_t d_id; + TAILQ_ENTRY(spdk_nvmf_fc_request) link; + TAILQ_ENTRY(spdk_nvmf_fc_request) pending_link; + TAILQ_HEAD(, spdk_nvmf_fc_caller_ctx) abort_cbs; +}; + +SPDK_STATIC_ASSERT(!offsetof(struct spdk_nvmf_fc_request, req), + "FC request and NVMF request address don't match."); + +/* + * NVMF FC Association + */ +struct spdk_nvmf_fc_association { + uint64_t assoc_id; + uint32_t s_id; + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_host *host; + enum spdk_nvmf_fc_object_state assoc_state; + + char host_id[FCNVME_ASSOC_HOSTID_LEN]; + char host_nqn[FCNVME_ASSOC_HOSTNQN_LEN]; + char sub_nqn[FCNVME_ASSOC_HOSTNQN_LEN]; + + struct spdk_nvmf_fc_conn *aq_conn; /* connection for admin queue */ + + uint16_t conn_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conns; + + void *conns_buf; + TAILQ_HEAD(, spdk_nvmf_fc_conn) avail_fc_conns; + + TAILQ_ENTRY(spdk_nvmf_fc_association) link; + + /* for port's association free list */ + TAILQ_ENTRY(spdk_nvmf_fc_association) port_free_assoc_list_link; + + void *ls_del_op_ctx; /* delete assoc. 
callback list */ + + /* req/resp buffers used to send disconnect to initiator */ + struct spdk_nvmf_fc_send_srsr snd_disconn_bufs; +}; + +/* + * FC Remote Port + */ +struct spdk_nvmf_fc_remote_port_info { + uint32_t s_id; + uint32_t rpi; + uint32_t assoc_count; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + enum spdk_nvmf_fc_object_state rport_state; + TAILQ_ENTRY(spdk_nvmf_fc_remote_port_info) link; +}; + +/* + * Poller API error codes + */ +enum spdk_nvmf_fc_poller_api_ret { + SPDK_NVMF_FC_POLLER_API_SUCCESS = 0, + SPDK_NVMF_FC_POLLER_API_ERROR, + SPDK_NVMF_FC_POLLER_API_INVALID_ARG, + SPDK_NVMF_FC_POLLER_API_NO_CONN_ID, + SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND, +}; + +/* + * Poller API definitions + */ +enum spdk_nvmf_fc_poller_api { + SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT, + SPDK_NVMF_FC_POLLER_API_AEN, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE, +}; + +/* + * Poller API callback function proto + */ +typedef void (*spdk_nvmf_fc_poller_api_cb)(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret); + +/* + * Poller API callback data + */ +struct spdk_nvmf_fc_poller_api_cb_info { + spdk_nvmf_fc_poller_api_cb cb_func; + void *cb_data; + enum spdk_nvmf_fc_poller_api_ret ret; +}; + +/* + * Poller API structures + */ +struct spdk_nvmf_fc_poller_api_add_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_del_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + bool send_abts; + /* internal */ + int fc_request_cnt; +}; + +struct spdk_nvmf_fc_poller_api_quiesce_queue_args { + void *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_activate_queue_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_abts_recvd_args { + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_done_args { + struct spdk_nvmf_fc_hwqp *hwqp; + uint64_t tag; +}; + +/* + * NVMF LS request structure + */ +struct spdk_nvmf_fc_ls_rqst { + struct spdk_nvmf_fc_buffer_desc rqstbuf; + struct spdk_nvmf_fc_buffer_desc rspbuf; + uint32_t rqst_len; + uint32_t rsp_len; + uint32_t rpi; + struct spdk_nvmf_fc_xri *xri; + uint16_t oxid; + void *private_data; /* for LLD only (LS does not touch) */ + TAILQ_ENTRY(spdk_nvmf_fc_ls_rqst) ls_pending_link; + uint32_t s_id; + uint32_t d_id; + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_tgt *nvmf_tgt; +}; + +/* + * RQ Buffer LS Overlay Structure + */ +#define FCNVME_LS_RSVD_SIZE (FCNVME_MAX_LS_BUFFER_SIZE - \ + (sizeof(struct spdk_nvmf_fc_ls_rqst) + FCNVME_MAX_LS_REQ_SIZE + FCNVME_MAX_LS_RSP_SIZE)) + +struct __attribute__((__packed__)) spdk_nvmf_fc_rq_buf_ls_request { + uint8_t rqst[FCNVME_MAX_LS_REQ_SIZE]; + uint8_t resp[FCNVME_MAX_LS_RSP_SIZE]; + struct spdk_nvmf_fc_ls_rqst ls_rqst; + uint8_t rsvd[FCNVME_LS_RSVD_SIZE]; +}; + +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvmf_fc_rq_buf_ls_request) == + 
FCNVME_MAX_LS_BUFFER_SIZE, "LS RQ Buffer overflow"); + + +struct spdk_nvmf_fc_poller_api_queue_sync_args { + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + + /* Used internally by poller */ + TAILQ_ENTRY(spdk_nvmf_fc_poller_api_queue_sync_args) link; +}; + +/* + * dump info + */ +struct spdk_nvmf_fc_queue_dump_info { + char *buffer; + int offset; +}; +#define SPDK_FC_HW_DUMP_BUF_SIZE (10 * 4096) + +static inline void +spdk_nvmf_fc_dump_buf_print(struct spdk_nvmf_fc_queue_dump_info *dump_info, char *fmt, ...) +{ + uint64_t buffer_size = SPDK_FC_HW_DUMP_BUF_SIZE; + int32_t avail = (int32_t)(buffer_size - dump_info->offset); + + if (avail > 0) { + va_list ap; + int32_t written; + + va_start(ap, fmt); + written = vsnprintf(dump_info->buffer + dump_info->offset, avail, fmt, ap); + if (written >= avail) { + dump_info->offset += avail; + } else { + dump_info->offset += written; + } + va_end(ap); + } +} + +/* + * NVMF FC caller callback definitions + */ +typedef void (*spdk_nvmf_fc_caller_cb)(void *hwqp, int32_t status, void *args); + +struct spdk_nvmf_fc_caller_ctx { + void *ctx; + spdk_nvmf_fc_caller_cb cb; + void *cb_args; + TAILQ_ENTRY(spdk_nvmf_fc_caller_ctx) link; +}; + +/* + * Low level FC driver function table (functions provided by vendor FC device driver) + */ +struct spdk_nvmf_fc_ll_drvr_ops { + + /* initialize the low level driver */ + int (*lld_init)(void); + + /* low level driver finish */ + void (*lld_fini)(void); + + /* initialize hw queues */ + int (*init_q)(struct spdk_nvmf_fc_hwqp *hwqp); + + void (*reinit_q)(spdk_nvmf_fc_lld_hwqp_t queues_prev, + spdk_nvmf_fc_lld_hwqp_t queues_curr); + + /* initialize hw queue buffers */ + int (*init_q_buffers)(struct spdk_nvmf_fc_hwqp *hwqp); + + /* poll the hw queues for requests */ + uint32_t (*poll_queue)(struct spdk_nvmf_fc_hwqp *hwqp); + + /* receive data (for data-in requests) */ + int (*recv_data)(struct spdk_nvmf_fc_request *fc_req); + + /* send data (for data-out requests) */ + int (*send_data)(struct spdk_nvmf_fc_request *fc_req); + + /* release hw queust buffer */ + void (*q_buffer_release)(struct spdk_nvmf_fc_hwqp *hwqp, uint16_t buff_idx); + + /* transmist nvme response */ + int (*xmt_rsp)(struct spdk_nvmf_fc_request *fc_req, uint8_t *ersp_buf, uint32_t ersp_len); + + /* transmist LS response */ + int (*xmt_ls_rsp)(struct spdk_nvmf_fc_nport *tgtport, struct spdk_nvmf_fc_ls_rqst *ls_rqst); + + /* issue abts */ + int (*issue_abort)(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_xri *xri, + bool send_abts, spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* transmit abts response */ + int (*xmt_bls_rsp)(struct spdk_nvmf_fc_hwqp *hwqp, uint16_t ox_id, uint16_t rx_id, uint16_t rpi, + bool rjt, uint8_t rjt_exp, spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* transmit single request - single response */ + int (*xmt_srsr_req)(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_send_srsr *srsr, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* issue queue marker (abts processing) */ + int (*issue_q_marker)(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t u_id, uint16_t skip_rq); + + /* assign a new connection to a hwqp (return connection ID) */ + struct spdk_nvmf_fc_hwqp *(*assign_conn_to_hwqp)( + struct spdk_nvmf_fc_hwqp *queues, uint32_t num_queues, + uint64_t *conn_id, uint32_t sq_size, bool for_aq); + + /* get the hwqp from the given connection id */ + struct spdk_nvmf_fc_hwqp *(*get_hwqp_from_conn_id)(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t num_queues, uint64_t conn_id); + + /* 
release connection ID (done with using it) */ + void (*release_conn)(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t conn_id, uint32_t sq_size); + + /* dump all queue info into dump_info */ + void (*dump_all_queues)(struct spdk_nvmf_fc_hwqp *ls_queues, + struct spdk_nvmf_fc_hwqp *io_queues, + uint32_t num_queues, + struct spdk_nvmf_fc_queue_dump_info *dump_info); +}; + +extern struct spdk_nvmf_fc_ll_drvr_ops spdk_nvmf_fc_lld_ops; + +/* + * NVMF FC inline and function prototypes + */ + +static inline struct spdk_nvmf_fc_request * +spdk_nvmf_fc_get_fc_req(struct spdk_nvmf_request *req) +{ + return (struct spdk_nvmf_fc_request *) + ((uintptr_t)req - offsetof(struct spdk_nvmf_fc_request, req)); +} + +static inline bool +spdk_nvmf_fc_is_port_dead(struct spdk_nvmf_fc_hwqp *hwqp) +{ + switch (hwqp->fc_port->hw_port_status) { + case SPDK_FC_PORT_QUIESCED: + return true; + default: + return false; + } +} + +static inline bool +spdk_nvmf_fc_req_in_xfer(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_XFER: + case SPDK_NVMF_FC_REQ_READ_RSP: + case SPDK_NVMF_FC_REQ_WRITE_XFER: + case SPDK_NVMF_FC_REQ_WRITE_RSP: + case SPDK_NVMF_FC_REQ_NONE_RSP: + return true; + default: + return false; + } +} + +typedef void (*spdk_nvmf_fc_del_assoc_cb)(void *arg, uint32_t err); +int spdk_nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data); + +void spdk_nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port); + +void spdk_nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port); + +struct spdk_nvmf_fc_port *spdk_nvmf_fc_port_list_get(uint8_t port_hdl); + +int spdk_nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state); + +int spdk_nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state); + +bool spdk_nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool spdk_nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +void spdk_nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, + void *queues_curr); + +void spdk_nvmf_fc_init_poller(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_add_hwqp_to_poller(struct spdk_nvmf_fc_hwqp *hwqp, bool admin_q); + +void spdk_nvmf_fc_remove_hwqp_from_poller(struct spdk_nvmf_fc_hwqp *hwqp); + +bool spdk_nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port); + +bool spdk_nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_hwqp_port_set_online(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_hwqp_port_set_offline(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state); + +void spdk_nvmf_fc_port_list_add(struct spdk_nvmf_fc_port *fc_port); + +struct spdk_nvmf_fc_nport *spdk_nvmf_fc_nport_get(uint8_t port_hdl, uint16_t nport_hdl); + +int spdk_nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +uint32_t spdk_nvmf_fc_nport_get_association_count(struct spdk_nvmf_fc_nport *nport); + +int 
spdk_nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +uint32_t spdk_nvmf_fc_get_prli_service_params(void); + +bool spdk_nvmf_fc_nport_is_rport_empty(struct spdk_nvmf_fc_nport *nport); + +void spdk_nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, + uint16_t rpi, uint16_t oxid, + uint16_t rxid); + +void spdk_nvmf_fc_dump_all_queues(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_queue_dump_info *dump_info); + +void spdk_nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst); + +int spdk_nvmf_fc_xmt_ls_rsp(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst); + +struct spdk_nvmf_fc_nport *spdk_nvmf_bcm_req_fc_nport_get(struct spdk_nvmf_request *req); + +struct spdk_nvmf_fc_association *spdk_nvmf_fc_get_ctrlr_assoc(struct spdk_nvmf_ctrlr *ctrlr); + +bool spdk_nvmf_fc_nport_is_association_empty(struct spdk_nvmf_fc_nport *nport); + +int spdk_nvmf_fc_xmt_srsr_req(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_send_srsr *srsr, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + +uint32_t spdk_nvmf_fc_get_num_nport_ctrlrs_in_subsystem(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_subsystem *subsys); + +bool spdk_nvmf_fc_is_spdk_ctrlr_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr); + +int spdk_nvmf_fc_get_ctrlr_init_traddr(char *traddr, struct spdk_nvmf_ctrlr *ctrlr); + +uint32_t spdk_nvmf_fc_get_hwqp_id(struct spdk_nvmf_request *req); + +void spdk_nvmf_fc_req_abort(struct spdk_nvmf_fc_request *fc_req, + bool send_abts, spdk_nvmf_fc_caller_cb cb, + void *cb_args); + +int spdk_nvmf_fc_add_port_listen(void *arg1, void *arg2); + +int spdk_nvmf_fc_remove_port_listen(void *arg1, void *arg2); + +void spdk_nvmf_fc_subsys_connect_cb(void *cb_ctx, + struct spdk_nvmf_request *req); + +void spdk_nvmf_fc_subsys_disconnect_cb(void *cb_ctx, + struct spdk_nvmf_qpair *qpair); + +uint32_t spdk_nvmf_fc_get_master_lcore(void); + +struct spdk_thread *spdk_nvmf_fc_get_master_thread(void); + +/* + * These functions are used by low level FC driver + */ + +static inline struct spdk_nvmf_fc_conn * +spdk_nvmf_fc_get_conn(struct spdk_nvmf_qpair *qpair) +{ + return (struct spdk_nvmf_fc_conn *) + ((uintptr_t)qpair - offsetof(struct spdk_nvmf_fc_conn, qpair)); +} + +static inline uint16_t +spdk_nvmf_fc_advance_conn_sqhead(struct spdk_nvmf_qpair *qpair) +{ + /* advance sq_head pointer - wrap if needed */ + qpair->sq_head = (qpair->sq_head == qpair->sq_head_max) ? + 0 : (qpair->sq_head + 1); + return qpair->sq_head; +} + +static inline bool +spdk_nvmf_fc_use_send_frame(struct spdk_nvmf_request *req) +{ + /* For now use for only keepalives. 
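+ * i.e. Keep Alive commands arriving on the admin queue (qid 0), as checked below.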
*/ + if (req->qpair->qid == 0 && + (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_KEEP_ALIVE)) { + return true; + } + return false; +} + +enum spdk_nvmf_fc_poller_api_ret spdk_nvmf_fc_poller_api_func( + struct spdk_nvmf_fc_hwqp *hwqp, + enum spdk_nvmf_fc_poller_api api, + void *api_args); + +int spdk_nvmf_fc_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen); + +void spdk_nvmf_fc_process_pending_req(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_process_pending_ls_rqst(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_req_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state); + +void spdk_nvmf_fc_free_req(struct spdk_nvmf_fc_request *fc_req); + +void spdk_nvmf_fc_req_abort_complete(void *arg1); + +bool spdk_nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len); + +struct spdk_nvmf_fc_xri *spdk_nvmf_fc_get_xri(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_put_xri(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_xri *xri); + +void spdk_nvmf_fc_release_xri(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_xri *xri, bool xb, bool abts); + +int spdk_nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *req); +#endif diff --git a/src/spdk/lib/nvmf/nvmf_internal.h b/src/spdk/lib/nvmf/nvmf_internal.h new file mode 100644 index 00000000..c9c7bf36 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_internal.h @@ -0,0 +1,333 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __NVMF_INTERNAL_H__ +#define __NVMF_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/likely.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/assert.h" +#include "spdk/bdev.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#define SPDK_NVMF_MAX_SGL_ENTRIES 16 + +enum spdk_nvmf_subsystem_state { + SPDK_NVMF_SUBSYSTEM_INACTIVE = 0, + SPDK_NVMF_SUBSYSTEM_ACTIVATING, + SPDK_NVMF_SUBSYSTEM_ACTIVE, + SPDK_NVMF_SUBSYSTEM_PAUSING, + SPDK_NVMF_SUBSYSTEM_PAUSED, + SPDK_NVMF_SUBSYSTEM_RESUMING, + SPDK_NVMF_SUBSYSTEM_DEACTIVATING, +}; + +enum spdk_nvmf_qpair_state { + SPDK_NVMF_QPAIR_UNINITIALIZED = 0, + SPDK_NVMF_QPAIR_INACTIVE, + SPDK_NVMF_QPAIR_ACTIVATING, + SPDK_NVMF_QPAIR_ACTIVE, + SPDK_NVMF_QPAIR_DEACTIVATING, + SPDK_NVMF_QPAIR_ERROR, +}; + +typedef void (*spdk_nvmf_state_change_done)(void *cb_arg, int status); + +struct spdk_nvmf_tgt { + struct spdk_nvmf_tgt_opts opts; + + uint64_t discovery_genctr; + + /* Array of subsystem pointers of size max_subsystems indexed by sid */ + struct spdk_nvmf_subsystem **subsystems; + + struct spdk_nvmf_discovery_log_page *discovery_log_page; + size_t discovery_log_page_size; + TAILQ_HEAD(, spdk_nvmf_transport) transports; + + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; +}; + +struct spdk_nvmf_host { + char *nqn; + TAILQ_ENTRY(spdk_nvmf_host) link; +}; + +struct spdk_nvmf_listener { + struct spdk_nvme_transport_id trid; + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_listener) link; +}; + +struct spdk_nvmf_transport_poll_group { + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_transport_poll_group) link; +}; + +struct spdk_nvmf_subsystem_poll_group { + /* Array of channels for each namespace indexed by nsid - 1 */ + struct spdk_io_channel **channels; + uint32_t num_channels; + + enum spdk_nvmf_subsystem_state state; + + TAILQ_HEAD(, spdk_nvmf_request) queued; +}; + +struct spdk_nvmf_poll_group { + struct spdk_thread *thread; + struct spdk_poller *poller; + + TAILQ_HEAD(, spdk_nvmf_transport_poll_group) tgroups; + + /* Array of poll groups indexed by subsystem id (sid) */ + struct spdk_nvmf_subsystem_poll_group *sgroups; + uint32_t num_sgroups; + + /* All of the queue pairs that belong to this poll group */ + TAILQ_HEAD(, spdk_nvmf_qpair) qpairs; +}; + +typedef enum _spdk_nvmf_request_exec_status { + SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE, + SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS, +} spdk_nvmf_request_exec_status; + +union nvmf_h2c_msg { + struct spdk_nvmf_capsule_cmd nvmf_cmd; + struct spdk_nvme_cmd nvme_cmd; + struct spdk_nvmf_fabric_prop_set_cmd prop_set_cmd; + struct spdk_nvmf_fabric_prop_get_cmd prop_get_cmd; + struct spdk_nvmf_fabric_connect_cmd connect_cmd; +}; +SPDK_STATIC_ASSERT(sizeof(union nvmf_h2c_msg) == 64, "Incorrect size"); + +union nvmf_c2h_msg { + struct spdk_nvme_cpl nvme_cpl; + struct spdk_nvmf_fabric_prop_get_rsp prop_get_rsp; + struct spdk_nvmf_fabric_connect_rsp connect_rsp; +}; +SPDK_STATIC_ASSERT(sizeof(union nvmf_c2h_msg) == 16, "Incorrect size"); + +struct spdk_nvmf_request { + struct spdk_nvmf_qpair *qpair; + uint32_t length; + enum spdk_nvme_data_transfer xfer; + void *data; + union nvmf_h2c_msg *cmd; + union nvmf_c2h_msg *rsp; + struct iovec iov[SPDK_NVMF_MAX_SGL_ENTRIES]; + uint32_t iovcnt; + struct spdk_bdev_io_wait_entry bdev_io_wait; + + TAILQ_ENTRY(spdk_nvmf_request) link; +}; + +struct spdk_nvmf_ns { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_bdev *bdev; + struct 
spdk_bdev_desc *desc; + struct spdk_nvmf_ns_opts opts; +}; + +struct spdk_nvmf_qpair { + enum spdk_nvmf_qpair_state state; + spdk_nvmf_state_change_done state_cb; + void *state_cb_arg; + + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_poll_group *group; + + uint16_t qid; + uint16_t sq_head; + uint16_t sq_head_max; + + TAILQ_HEAD(, spdk_nvmf_request) outstanding; + TAILQ_ENTRY(spdk_nvmf_qpair) link; +}; + +struct spdk_nvmf_ctrlr_feat { + union spdk_nvme_feat_arbitration arbitration; + union spdk_nvme_feat_power_management power_management; + union spdk_nvme_feat_error_recovery error_recovery; + union spdk_nvme_feat_volatile_write_cache volatile_write_cache; + union spdk_nvme_feat_number_of_queues number_of_queues; + union spdk_nvme_feat_write_atomicity write_atomicity; + union spdk_nvme_feat_async_event_configuration async_event_configuration; + union spdk_nvme_feat_keep_alive_timer keep_alive_timer; +}; + +/* + * This structure represents an NVMe-oF controller, + * which is like a "session" in networking terms. + */ +struct spdk_nvmf_ctrlr { + uint16_t cntlid; + struct spdk_nvmf_subsystem *subsys; + + struct { + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + } vcprop; /* virtual controller properties */ + + struct spdk_nvmf_ctrlr_feat feat; + + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_thread *thread; + struct spdk_bit_array *qpair_mask; + + struct spdk_nvmf_request *aer_req; + union spdk_nvme_async_event_completion notice_event; + uint8_t hostid[16]; + + uint16_t changed_ns_list_count; + struct spdk_nvme_ns_list changed_ns_list; + + TAILQ_ENTRY(spdk_nvmf_ctrlr) link; +}; + +struct spdk_nvmf_subsystem { + struct spdk_thread *thread; + uint32_t id; + enum spdk_nvmf_subsystem_state state; + + char subnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + enum spdk_nvmf_subtype subtype; + uint16_t next_cntlid; + bool allow_any_host; + + struct spdk_nvmf_tgt *tgt; + + char sn[SPDK_NVME_CTRLR_SN_LEN + 1]; + + /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */ + struct spdk_nvmf_ns **ns; + uint32_t max_nsid; + /* This is the maximum allowed nsid to a subsystem */ + uint32_t max_allowed_nsid; + + TAILQ_HEAD(, spdk_nvmf_ctrlr) ctrlrs; + + TAILQ_HEAD(, spdk_nvmf_host) hosts; + + TAILQ_HEAD(, spdk_nvmf_listener) listeners; + + TAILQ_ENTRY(spdk_nvmf_subsystem) entries; +}; + +typedef void(*spdk_nvmf_poll_group_mod_done)(void *cb_arg, int status); + +struct spdk_nvmf_transport *spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, + enum spdk_nvme_transport_type); + +int spdk_nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport); +int spdk_nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem); +int spdk_nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, 
spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_request_exec(struct spdk_nvmf_request *req); +int spdk_nvmf_request_free(struct spdk_nvmf_request *req); +int spdk_nvmf_request_complete(struct spdk_nvmf_request *req); + +void spdk_nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, + void *buffer, uint64_t offset, + uint32_t length); + +void spdk_nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr); +int spdk_nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req); +int spdk_nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req); +int spdk_nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req); +bool spdk_nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr); +bool spdk_nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr); +void spdk_nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid); + +void spdk_nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata); + +int spdk_nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void spdk_nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +struct spdk_nvmf_ctrlr *spdk_nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, + uint16_t cntlid); +int spdk_nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Abort aer is sent on a per controller basis and sends a completion for the aer to the host. + * This function should be called when attempting to recover in error paths when it is OK for + * the host to send a subsequent AER. + */ +void spdk_nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Free aer simply frees the rdma resources for the aer without informing the host. + * This function should be called when deleting a qpair when one wants to make sure + * the qpair is completely empty before freeing the request. The reason we free the + * AER without sending a completion is to prevent the host from sending another AER. + */ +void spdk_nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair); + +static inline struct spdk_nvmf_ns * +_spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + /* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */ + if (spdk_unlikely(nsid - 1 >= subsystem->max_nsid)) { + return NULL; + } + + return subsystem->ns[nsid - 1]; +} + +static inline bool +spdk_nvmf_qpair_is_admin_queue(struct spdk_nvmf_qpair *qpair) +{ + return qpair->qid == 0; +} + +#endif /* __NVMF_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvmf/rdma.c b/src/spdk/lib/nvmf/rdma.c new file mode 100644 index 00000000..333e703f --- /dev/null +++ b/src/spdk/lib/nvmf/rdma.c @@ -0,0 +1,2930 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include +#include +#include + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/assert.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +/* + RDMA Connection Resource Defaults + */ +#define NVMF_DEFAULT_TX_SGE 1 +#define NVMF_DEFAULT_RX_SGE 2 +#define NVMF_DEFAULT_DATA_SGE 16 + +/* The RDMA completion queue size */ +#define NVMF_RDMA_CQ_SIZE 4096 + +/* AIO backend requires block size aligned data buffers, + * extra 4KiB aligned data buffer should work for most devices. + */ +#define SHIFT_4KB 12 +#define NVMF_DATA_BUFFER_ALIGNMENT (1 << SHIFT_4KB) +#define NVMF_DATA_BUFFER_MASK (NVMF_DATA_BUFFER_ALIGNMENT - 1) + +enum spdk_nvmf_rdma_request_state { + /* The request is not currently in use */ + RDMA_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + RDMA_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + RDMA_REQUEST_STATE_NEED_BUFFER, + + /* The request is waiting on RDMA queue depth availability + * to transfer data between the host and the controller. + */ + RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, + + /* The request is currently transferring data from the host to the controller. */ + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is ready to execute at the block device */ + RDMA_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + RDMA_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + RDMA_REQUEST_STATE_EXECUTED, + + /* The request is ready to send a completion */ + RDMA_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring data from the controller to the host. */ + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request currently has an outstanding completion without an + * associated data transfer. + */ + RDMA_REQUEST_STATE_COMPLETING, + + /* The request completed and can be marked free. 
*/ + RDMA_REQUEST_STATE_COMPLETED, + + /* Terminator */ + RDMA_REQUEST_NUM_STATES, +}; + +#define OBJECT_NVMF_RDMA_IO 0x40 + +#define TRACE_GROUP_NVMF_RDMA 0x4 +#define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0) +#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4) +#define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5) +#define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8) +#define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9) +#define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA) +#define TRACE_RDMA_QP_CREATE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB) +#define TRACE_RDMA_IBV_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC) +#define TRACE_RDMA_CM_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD) +#define TRACE_RDMA_QP_STATE_CHANGE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE) +#define TRACE_RDMA_QP_DISCONNECT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF) +#define TRACE_RDMA_QP_DESTROY SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10) + +SPDK_TRACE_REGISTER_FN(nvmf_trace) +{ + spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); + spdk_trace_register_description("RDMA_REQ_NEW", "", + TRACE_RDMA_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "", + TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "", + TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTING", "", + TRACE_RDMA_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTED", "", + TRACE_RDMA_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "", + TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_CONTROLLER_TO_HOST", "", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_INCAPSULE", "", + TRACE_RDMA_REQUEST_STATE_COMPLETING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETED", "", + TRACE_RDMA_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + + 
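+ /* The remaining descriptions cover queue pair lifecycle and async event trace points rather than per-request state transitions. */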
spdk_trace_register_description("RDMA_QP_CREATE", "", TRACE_RDMA_QP_CREATE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", "", TRACE_RDMA_IBV_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", "", TRACE_RDMA_CM_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_QP_STATE_CHANGE", "", TRACE_RDMA_QP_STATE_CHANGE, + OWNER_NONE, OBJECT_NONE, 0, 1, "state: "); + spdk_trace_register_description("RDMA_QP_DISCONNECT", "", TRACE_RDMA_QP_DISCONNECT, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_QP_DESTROY", "", TRACE_RDMA_QP_DESTROY, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); +} + +/* This structure holds commands as they are received off the wire. + * It must be dynamically paired with a full request object + * (spdk_nvmf_rdma_request) to service a request. It is separate + * from the request because RDMA does not appear to order + * completions, so occasionally we'll get a new incoming + * command when there aren't any free request objects. + */ +struct spdk_nvmf_rdma_recv { + struct ibv_recv_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; + + struct spdk_nvmf_rdma_qpair *qpair; + + /* In-capsule data buffer */ + uint8_t *buf; + + TAILQ_ENTRY(spdk_nvmf_rdma_recv) link; +}; + +struct spdk_nvmf_rdma_request { + struct spdk_nvmf_request req; + bool data_from_pool; + + enum spdk_nvmf_rdma_request_state state; + + struct spdk_nvmf_rdma_recv *recv; + + struct { + struct ibv_send_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE]; + } rsp; + + struct { + struct ibv_send_wr wr; + struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; + void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES]; + } data; + + TAILQ_ENTRY(spdk_nvmf_rdma_request) link; + TAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; +}; + +struct spdk_nvmf_rdma_qpair { + struct spdk_nvmf_qpair qpair; + + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_poller *poller; + + struct rdma_cm_id *cm_id; + struct rdma_cm_id *listen_id; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + + /* The maximum number of active RDMA READ and WRITE operations at one time */ + uint16_t max_rw_depth; + + /* Receives that are waiting for a request object */ + TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; + + /* Queues to track the requests in all states */ + TAILQ_HEAD(, spdk_nvmf_rdma_request) state_queue[RDMA_REQUEST_NUM_STATES]; + + /* Number of requests in each state */ + uint32_t state_cntr[RDMA_REQUEST_NUM_STATES]; + + int max_sge; + + /* Array of size "max_queue_depth" containing RDMA requests. */ + struct spdk_nvmf_rdma_request *reqs; + + /* Array of size "max_queue_depth" containing RDMA recvs. */ + struct spdk_nvmf_rdma_recv *recvs; + + /* Array of size "max_queue_depth" containing 64 byte capsules + * used for receive. + */ + union nvmf_h2c_msg *cmds; + struct ibv_mr *cmds_mr; + + /* Array of size "max_queue_depth" containing 16 byte completions + * to be sent back to the user. + */ + union nvmf_c2h_msg *cpls; + struct ibv_mr *cpls_mr; + + /* Array of size "max_queue_depth * InCapsuleDataSize" containing + * buffers to be used for in capsule data. 
+ */ + void *bufs; + struct ibv_mr *bufs_mr; + + TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link; + + /* Mgmt channel */ + struct spdk_io_channel *mgmt_channel; + struct spdk_nvmf_rdma_mgmt_channel *ch; + + /* IBV queue pair attributes: they are used to manage + * qp state and recover from errors. + */ + struct ibv_qp_init_attr ibv_init_attr; + struct ibv_qp_attr ibv_attr; + + bool qpair_disconnected; + + /* Reference counter for how many unprocessed messages + * from other threads are currently outstanding. The + * qpair cannot be destroyed until this is 0. This is + * atomically incremented from any thread, but only + * decremented and read from the thread that owns this + * qpair. + */ + uint32_t refcnt; +}; + +struct spdk_nvmf_rdma_poller { + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *group; + + struct ibv_cq *cq; + + TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs; + + TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; +}; + +struct spdk_nvmf_rdma_poll_group { + struct spdk_nvmf_transport_poll_group group; + + TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; +}; + +/* Assuming rdma_cm uses just one protection domain per ibv_context. */ +struct spdk_nvmf_rdma_device { + struct ibv_device_attr attr; + struct ibv_context *context; + + struct spdk_mem_map *map; + struct ibv_pd *pd; + + TAILQ_ENTRY(spdk_nvmf_rdma_device) link; +}; + +struct spdk_nvmf_rdma_port { + struct spdk_nvme_transport_id trid; + struct rdma_cm_id *id; + struct spdk_nvmf_rdma_device *device; + uint32_t ref; + TAILQ_ENTRY(spdk_nvmf_rdma_port) link; +}; + +struct spdk_nvmf_rdma_transport { + struct spdk_nvmf_transport transport; + + struct rdma_event_channel *event_channel; + + struct spdk_mempool *data_buf_pool; + + pthread_mutex_t lock; + + /* fields used to poll RDMA/IB events */ + nfds_t npoll_fds; + struct pollfd *poll_fds; + + TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; + TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; +}; + +struct spdk_nvmf_rdma_mgmt_channel { + /* Requests that are waiting to obtain a data buffer */ + TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue; +}; + +static inline void +spdk_nvmf_rdma_qpair_inc_refcnt(struct spdk_nvmf_rdma_qpair *rqpair) +{ + __sync_fetch_and_add(&rqpair->refcnt, 1); +} + +static inline uint32_t +spdk_nvmf_rdma_qpair_dec_refcnt(struct spdk_nvmf_rdma_qpair *rqpair) +{ + uint32_t old_refcnt, new_refcnt; + + do { + old_refcnt = rqpair->refcnt; + assert(old_refcnt > 0); + new_refcnt = old_refcnt - 1; + } while (__sync_bool_compare_and_swap(&rqpair->refcnt, old_refcnt, new_refcnt) == false); + + return new_refcnt; +} + +/* API to IBV QueuePair */ +static const char *str_ibv_qp_state[] = { + "IBV_QPS_RESET", + "IBV_QPS_INIT", + "IBV_QPS_RTR", + "IBV_QPS_RTS", + "IBV_QPS_SQD", + "IBV_QPS_SQE", + "IBV_QPS_ERR" +}; + +static enum ibv_qp_state +spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) { + enum ibv_qp_state old_state, new_state; + int rc; + + /* All the attributes needed for recovery */ + static int spdk_nvmf_ibv_attr_mask = + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC; + + old_state = rqpair->ibv_attr.qp_state; + rc = ibv_query_qp(rqpair->cm_id->qp, &rqpair->ibv_attr, + spdk_nvmf_ibv_attr_mask, &rqpair->ibv_init_attr); + + if (rc) + { + SPDK_ERRLOG("Failed to get updated RDMA queue 
pair state!\n"); + assert(false); + } + + new_state = rqpair->ibv_attr.qp_state; + if (old_state != new_state) + { + spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0, + (uintptr_t)rqpair->cm_id, new_state); + } + return new_state; +} + +static int +spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair, + enum ibv_qp_state new_state) +{ + int rc; + enum ibv_qp_state state; + static int attr_mask_rc[] = { + [IBV_QPS_RESET] = IBV_QP_STATE, + [IBV_QPS_INIT] = (IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS), + [IBV_QPS_RTR] = (IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER), + [IBV_QPS_RTS] = (IBV_QP_STATE | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC), + [IBV_QPS_SQD] = IBV_QP_STATE, + [IBV_QPS_SQE] = IBV_QP_STATE, + [IBV_QPS_ERR] = IBV_QP_STATE, + }; + + switch (new_state) { + case IBV_QPS_RESET: + case IBV_QPS_INIT: + case IBV_QPS_RTR: + case IBV_QPS_RTS: + case IBV_QPS_SQD: + case IBV_QPS_SQE: + case IBV_QPS_ERR: + break; + default: + SPDK_ERRLOG("QP#%d: bad state requested: %u\n", + rqpair->qpair.qid, new_state); + return -1; + } + rqpair->ibv_attr.cur_qp_state = rqpair->ibv_attr.qp_state; + rqpair->ibv_attr.qp_state = new_state; + rqpair->ibv_attr.ah_attr.port_num = rqpair->ibv_attr.port_num; + + rc = ibv_modify_qp(rqpair->cm_id->qp, &rqpair->ibv_attr, + attr_mask_rc[new_state]); + + if (rc) { + SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n", + rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno)); + return rc; + } + + state = spdk_nvmf_rdma_update_ibv_state(rqpair); + + if (state != new_state) { + SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n", + rqpair->qpair.qid, str_ibv_qp_state[new_state], + str_ibv_qp_state[state]); + return -1; + } + SPDK_NOTICELOG("IBV QP#%u changed to: %s\n", rqpair->qpair.qid, + str_ibv_qp_state[state]); + return 0; +} + +static void +spdk_nvmf_rdma_request_set_state(struct spdk_nvmf_rdma_request *rdma_req, + enum spdk_nvmf_rdma_request_state state) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + qpair = rdma_req->req.qpair; + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + TAILQ_REMOVE(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]--; + + rdma_req->state = state; + + TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]++; +} + +static int +spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; + + TAILQ_INIT(&ch->pending_data_buf_queue); + return 0; +} + +static void +spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; + + if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n"); + } +} + +static int +spdk_nvmf_rdma_cur_rw_depth(struct spdk_nvmf_rdma_qpair *rqpair) +{ + return rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER] + + rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST]; +} + +static int +spdk_nvmf_rdma_cur_queue_depth(struct spdk_nvmf_rdma_qpair *rqpair) +{ + return rqpair->max_queue_depth - + rqpair->state_cntr[RDMA_REQUEST_STATE_FREE]; +} + +static void +spdk_nvmf_rdma_qpair_destroy(struct 
spdk_nvmf_rdma_qpair *rqpair) +{ + spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + if (spdk_nvmf_rdma_cur_queue_depth(rqpair)) { + rqpair->qpair_disconnected = true; + return; + } + + if (rqpair->refcnt > 0) { + return; + } + + if (rqpair->poller) { + TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); + } + + if (rqpair->cmds_mr) { + ibv_dereg_mr(rqpair->cmds_mr); + } + + if (rqpair->cpls_mr) { + ibv_dereg_mr(rqpair->cpls_mr); + } + + if (rqpair->bufs_mr) { + ibv_dereg_mr(rqpair->bufs_mr); + } + + if (rqpair->cm_id) { + rdma_destroy_qp(rqpair->cm_id); + rdma_destroy_id(rqpair->cm_id); + } + + if (rqpair->mgmt_channel) { + spdk_put_io_channel(rqpair->mgmt_channel); + } + + /* Free all memory */ + spdk_dma_free(rqpair->cmds); + spdk_dma_free(rqpair->cpls); + spdk_dma_free(rqpair->bufs); + free(rqpair->reqs); + free(rqpair->recvs); + free(rqpair); +} + +static int +spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair; + int rc, i; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_transport *transport; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + memset(&rqpair->ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr)); + rqpair->ibv_init_attr.qp_context = rqpair; + rqpair->ibv_init_attr.qp_type = IBV_QPT_RC; + rqpair->ibv_init_attr.send_cq = rqpair->poller->cq; + rqpair->ibv_init_attr.recv_cq = rqpair->poller->cq; + rqpair->ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth * + 2; /* SEND, READ, and WRITE operations */ + rqpair->ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */ + rqpair->ibv_init_attr.cap.max_send_sge = rqpair->max_sge; + rqpair->ibv_init_attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE; + + rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &rqpair->ibv_init_attr); + if (rc) { + SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno)); + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); + + rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs)); + rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs)); + rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds), + 0x1000, NULL); + rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls), + 0x1000, NULL); + + + if (transport->opts.in_capsule_data_size > 0) { + rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, + 0x1000, NULL); + } + + if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds || + !rqpair->cpls || (transport->opts.in_capsule_data_size && !rqpair->bufs)) { + SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds, + rqpair->max_queue_depth * sizeof(*rqpair->cmds), + IBV_ACCESS_LOCAL_WRITE); + rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls, + rqpair->max_queue_depth * sizeof(*rqpair->cpls), + 0); + + if 
(transport->opts.in_capsule_data_size) { + rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs, + rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + } + + if (!rqpair->cmds_mr || !rqpair->cpls_mr || (transport->opts.in_capsule_data_size && + !rqpair->bufs_mr)) { + SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", + rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", + rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey); + if (rqpair->bufs && rqpair->bufs_mr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", + rqpair->bufs, rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, rqpair->bufs_mr->lkey); + } + + /* Initialise request state queues and counters of the queue pair */ + for (i = RDMA_REQUEST_STATE_FREE; i < RDMA_REQUEST_NUM_STATES; i++) { + TAILQ_INIT(&rqpair->state_queue[i]); + rqpair->state_cntr[i] = 0; + } + + for (i = 0; i < rqpair->max_queue_depth; i++) { + struct ibv_recv_wr *bad_wr = NULL; + + rdma_recv = &rqpair->recvs[i]; + rdma_recv->qpair = rqpair; + + /* Set up memory to receive commands */ + if (rqpair->bufs) { + rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * + transport->opts.in_capsule_data_size)); + } + + rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i]; + rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]); + rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey; + rdma_recv->wr.num_sge = 1; + + if (rdma_recv->buf && rqpair->bufs_mr) { + rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; + rdma_recv->sgl[1].length = transport->opts.in_capsule_data_size; + rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey; + rdma_recv->wr.num_sge++; + } + + rdma_recv->wr.wr_id = (uintptr_t)rdma_recv; + rdma_recv->wr.sg_list = rdma_recv->sgl; + + rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + } + + for (i = 0; i < rqpair->max_queue_depth; i++) { + rdma_req = &rqpair->reqs[i]; + + rdma_req->req.qpair = &rqpair->qpair; + rdma_req->req.cmd = NULL; + + /* Set up memory to send responses */ + rdma_req->req.rsp = &rqpair->cpls[i]; + + rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i]; + rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]); + rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey; + + rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req; + rdma_req->rsp.wr.next = NULL; + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; + rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); + + /* Set up memory for data buffers */ + rdma_req->data.wr.wr_id = (uint64_t)rdma_req; + rdma_req->data.wr.next = NULL; + rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->data.wr.sg_list = rdma_req->data.sgl; + rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); + + /* Initialize request state to FREE */ + rdma_req->state = RDMA_REQUEST_STATE_FREE; + TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]++; + } + + return 0; +} + +static int +request_transfer_in(struct 
spdk_nvmf_request *req) +{ + int rc; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_send_wr *bad_wr = NULL; + + qpair = req->qpair; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair); + + rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; + rdma_req->data.wr.next = NULL; + rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Unable to transfer data from host to target\n"); + return -1; + } + return 0; +} + +static int +request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) +{ + int rc; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvme_cpl *rsp; + struct ibv_recv_wr *bad_recv_wr = NULL; + struct ibv_send_wr *send_wr, *bad_send_wr = NULL; + + *data_posted = 0; + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + /* Post the capsule to the recv buffer */ + assert(rdma_req->recv != NULL); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv, + rqpair); + rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + return rc; + } + rdma_req->recv = NULL; + + /* Build the response which consists of an optional + * RDMA WRITE to transfer data, plus an RDMA SEND + * containing the response. + */ + send_wr = &rdma_req->rsp.wr; + + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair); + + rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; + + rdma_req->data.wr.next = send_wr; + *data_posted = 1; + send_wr = &rdma_req->data.wr; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. 
Request: %p Connection: %p\n", req, qpair); + + /* Send the completion */ + rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); + if (rc) { + SPDK_ERRLOG("Unable to send response capsule\n"); + } + + return rc; +} + +static int +spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_accept_private_data accept_data; + struct rdma_conn_param ctrlr_event_data = {}; + int rc; + + accept_data.recfmt = 0; + accept_data.crqsize = rqpair->max_queue_depth; + + ctrlr_event_data.private_data = &accept_data; + ctrlr_event_data.private_data_len = sizeof(accept_data); + if (id->ps == RDMA_PS_TCP) { + ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ + ctrlr_event_data.initiator_depth = rqpair->max_rw_depth; + } + + rc = rdma_accept(id, &ctrlr_event_data); + if (rc) { + SPDK_ERRLOG("Error %d on rdma_accept\n", errno); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); + } + + return rc; +} + +static void +spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) +{ + struct spdk_nvmf_rdma_reject_private_data rej_data; + + rej_data.recfmt = 0; + rej_data.sts = error; + + rdma_reject(id, &rej_data, sizeof(rej_data)); +} + +static int +nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, + new_qpair_fn cb_fn) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct spdk_nvmf_rdma_port *port; + struct rdma_conn_param *rdma_param = NULL; + const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; + uint16_t max_queue_depth; + uint16_t max_rw_depth; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ + assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ + + rdma_param = &event->param.conn; + if (rdma_param->private_data == NULL || + rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_ERRLOG("connect request: no private data provided\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); + return -1; + } + + private_data = rdma_param->private_data; + if (private_data->recfmt != 0) { + SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", + event->id->verbs->device->name, event->id->verbs->device->dev_name); + + port = event->listen_id->context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", + event->listen_id, event->listen_id->verbs, port); + + /* Figure out the supported queue depth. 
This is a multi-step process + * that takes into account hardware maximums, host provided values, + * and our target's internal memory limits */ + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); + + /* Start with the maximum queue depth allowed by the target */ + max_queue_depth = rtransport->transport.opts.max_queue_depth; + max_rw_depth = rtransport->transport.opts.max_queue_depth; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", + rtransport->transport.opts.max_queue_depth); + + /* Next check the local NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", + port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); + max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); + max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom); + + /* Next check the remote NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", + rdma_param->initiator_depth, rdma_param->responder_resources); + if (rdma_param->initiator_depth > 0) { + max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth); + } + + /* Finally check for the host software requested values, which are + * optional. */ + if (rdma_param->private_data != NULL && + rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", + max_queue_depth, max_rw_depth); + + rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); + if (rqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + return -1; + } + + rqpair->port = port; + rqpair->max_queue_depth = max_queue_depth; + rqpair->max_rw_depth = max_rw_depth; + rqpair->cm_id = event->id; + rqpair->listen_id = event->listen_id; + rqpair->qpair.transport = transport; + rqpair->max_sge = spdk_min(port->device->attr.max_sge, SPDK_NVMF_MAX_SGL_ENTRIES); + TAILQ_INIT(&rqpair->incoming_queue); + event->id->context = &rqpair->qpair; + + cb_fn(&rqpair->qpair); + + return 0; +} + +static void +_nvmf_rdma_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); +} + +static void +_nvmf_rdma_disconnect_retry(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_poll_group *group; + + /* Read the group out of the qpair. This is normally set and accessed only from + * the thread that created the group. Here, we're not on that thread necessarily. + * The data member qpair->group begins its life as NULL and then is assigned to + * a pointer and never changes. So fortunately reading this and checking for + * non-NULL is thread safe in the x86_64 memory model. */ + group = qpair->group; + + if (group == NULL) { + /* The qpair hasn't been assigned to a group yet, so we can't + * process a disconnect. 
Send a message to ourself and try again. */ + spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_disconnect_retry, qpair); + return; + } + + spdk_thread_send_msg(group->thread, _nvmf_rdma_disconnect, qpair); +} + +static int +nvmf_rdma_disconnect(struct rdma_cm_event *evt) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + if (evt->id == NULL) { + SPDK_ERRLOG("disconnect request: missing cm_id\n"); + return -1; + } + + qpair = evt->id->context; + if (qpair == NULL) { + SPDK_ERRLOG("disconnect request: no active connection\n"); + return -1; + } + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + + _nvmf_rdma_disconnect_retry(qpair); + + return 0; +} + +#ifdef DEBUG +static const char *CM_EVENT_STR[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; +#endif /* DEBUG */ + +static void +spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct rdma_cm_event *event; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + if (rtransport->event_channel == NULL) { + return; + } + + while (1) { + rc = rdma_get_cm_event(rtransport->event_channel, &event); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); + + spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + /* No action required. The target never attempts to resolve routes. */ + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rc = nvmf_rdma_connect(transport, event, cb_fn); + if (rc < 0) { + SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + /* The target never initiates a new connection. So this will not occur. */ + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + /* Can this happen? The docs say it can, but not sure what causes it. */ + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + /* These only occur on the client side. */ + break; + case RDMA_CM_EVENT_ESTABLISHED: + /* TODO: Should we be waiting for this event anywhere? */ + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + /* Multicast is not used */ + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + /* Not utilizing this event */ + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* For now, do nothing. The target never re-uses queue pairs. 
*/ + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + + rdma_ack_cm_event(event); + } else { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); + } + break; + } + } +} + +static int +spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct spdk_nvmf_rdma_device *device = cb_ctx; + struct ibv_pd *pd = device->pd; + struct ibv_mr *mr; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -1; + } else { + spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + if (mr) { + ibv_dereg_mr(mr); + } + break; + } + + return 0; +} + +typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t; + +static spdk_nvme_data_transfer_t +spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req) +{ + enum spdk_nvme_data_transfer xfer; + struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd; + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.imm_data = 0; +#endif + + /* Figure out data transfer direction */ + if (cmd->opc == SPDK_NVME_OPC_FABRIC) { + xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype); + } else { + xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); + + /* Some admin commands are special cases */ + if ((rdma_req->req.qpair->qid == 0) && + ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) || + (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) { + switch (cmd->cdw10 & 0xff) { + case SPDK_NVME_FEAT_LBA_RANGE_TYPE: + case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + break; + default: + xfer = SPDK_NVME_DATA_NONE; + } + } + } + + if (xfer == SPDK_NVME_DATA_NONE) { + return xfer; + } + + /* Even for commands that may transfer data, they could have specified 0 length. + * We want those to show up with xfer SPDK_NVME_DATA_NONE. 
+ */ + switch (sgl->generic.type) { + case SPDK_NVME_SGL_TYPE_DATA_BLOCK: + case SPDK_NVME_SGL_TYPE_BIT_BUCKET: + case SPDK_NVME_SGL_TYPE_SEGMENT: + case SPDK_NVME_SGL_TYPE_LAST_SEGMENT: + case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK: + if (sgl->unkeyed.length == 0) { + xfer = SPDK_NVME_DATA_NONE; + } + break; + case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK: + if (sgl->keyed.length == 0) { + xfer = SPDK_NVME_DATA_NONE; + } + break; + } + + return xfer; +} + +static int +spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + void *buf = NULL; + uint32_t length = rdma_req->req.length; + uint32_t i = 0; + + rdma_req->req.iovcnt = 0; + while (length) { + buf = spdk_mempool_get(rtransport->data_buf_pool); + if (!buf) { + goto nomem; + } + + rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->transport.opts.io_unit_size); + rdma_req->req.iovcnt++; + rdma_req->data.buffers[i] = buf; + rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base); + rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len; + rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, + (uint64_t)buf, NULL))->lkey; + + length -= rdma_req->req.iov[i].iov_len; + i++; + } + + rdma_req->data_from_pool = true; + + return 0; + +nomem: + while (i) { + i--; + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->req.iov[i].iov_base); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->req.iov[i].iov_len = 0; + + rdma_req->data.wr.sg_list[i].addr = 0; + rdma_req->data.wr.sg_list[i].length = 0; + rdma_req->data.wr.sg_list[i].lkey = 0; + } + rdma_req->req.iovcnt = 0; + return -ENOMEM; +} + +static int +spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvme_cmd *cmd; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + + cmd = &rdma_req->req.cmd->nvme_cmd; + rsp = &rdma_req->req.rsp->nvme_cpl; + sgl = &cmd->dptr.sgl1; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && + (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || + sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { + if (sgl->keyed.length > rtransport->transport.opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + sgl->keyed.length, rtransport->transport.opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = sgl->keyed.key; + } + } +#endif + + /* fill request length and populate iovs */ + rdma_req->req.length = sgl->keyed.length; + + if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) { + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. 
Queueing request %p\n", rdma_req); + return 0; + } + + /* backward compatible */ + rdma_req->req.data = rdma_req->req.iov[0].iov_base; + + /* rdma wr specifics */ + rdma_req->data.wr.num_sge = rdma_req->req.iovcnt; + rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; + rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + rdma_req->req.iovcnt); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, sgl->unkeyed.length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (sgl->unkeyed.length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + sgl->unkeyed.length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + rdma_req->req.data = rdma_req->recv->buf + offset; + rdma_req->data_from_pool = false; + rdma_req->req.length = sgl->unkeyed.length; + + rdma_req->req.iov[0].iov_base = rdma_req->req.data; + rdma_req->req.iov[0].iov_len = rdma_req->req.length; + rdma_req->req.iovcnt = 1; + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static bool +spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + int rc; + struct spdk_nvmf_rdma_recv *rdma_recv; + enum spdk_nvmf_rdma_request_state prev_state; + bool progress = false; + int data_posted; + int cur_rdma_rw_depth; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->port->device; + + assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); + + /* If the queue pair is in an error state, force the request to the completed state + * to release resources. */ + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + } + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = rdma_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); + + switch (rdma_req->state) { + case RDMA_REQUEST_STATE_FREE: + /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rdma_recv = rdma_req->recv; + + /* The first element of the SGL is the NVMe command */ + rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; + memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); + + TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link); + + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + break; + } + + /* The next state transition depends on the data transfer needs of this request. */ + rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); + + /* If no data to transfer, ready to execute. */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + break; + } + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEED_BUFFER); + TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + break; + case RDMA_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) { + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); + if (rc < 0) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + break; + } + + if (!rdma_req->req.data) { + /* No buffers available. */ + break; + } + + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + + /* If data is transferring from host to controller and the data didn't + * arrive using in capsule data, we need to do a transfer from the host. + */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + break; + } + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING])) { + /* This request needs to wait in line to perform RDMA */ + break; + } + cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair); + + if (cur_rdma_rw_depth >= rqpair->max_rw_depth) { + /* R/W queue is full, need to wait */ + break; + } + + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + rc = request_transfer_in(&rdma_req->req); + if (!rc) { + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + } else { + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } + } else if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + /* The data transfer will be kicked off from + * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. 
+ */ + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } else { + SPDK_ERRLOG("Cannot perform data transfer, unknown state: %u\n", + rdma_req->req.xfer); + assert(0); + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTING); + spdk_nvmf_request_exec(&rdma_req->req); + break; + case RDMA_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + } else { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } + break; + case RDMA_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rc = request_transfer_out(&rdma_req->req, &data_posted); + assert(rc == 0); /* No good way to handle this currently */ + if (rc) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } else { + spdk_nvmf_rdma_request_set_state(rdma_req, + data_posted ? + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : + RDMA_REQUEST_STATE_COMPLETING); + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_COMPLETING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. 
*/ + break; + case RDMA_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req->data_from_pool) { + /* Put the buffer/s back in the pool */ + for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->data.buffers[i] = NULL; + } + rdma_req->data_from_pool = false; + } + rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE); + break; + case RDMA_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (rdma_req->state != prev_state) { + progress = true; + } + } while (rdma_req->state != prev_state); + + return progress; +} + +/* Public API callbacks begin here */ + +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 +#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE 131072 + +static void +spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE; + opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; +} + +static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); + +static struct spdk_nvmf_transport * +spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) +{ + int rc; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + struct ibv_context **contexts; + uint32_t i; + int flag; + uint32_t sge_count; + + const struct spdk_mem_map_ops nvmf_rdma_map_ops = { + .notify_cb = spdk_nvmf_rdma_mem_notify, + .are_contiguous = NULL + }; + + rtransport = calloc(1, sizeof(*rtransport)); + if (!rtransport) { + return NULL; + } + + if (pthread_mutex_init(&rtransport->lock, NULL)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + free(rtransport); + return NULL; + } + + spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create, + spdk_nvmf_rdma_mgmt_channel_destroy, + sizeof(struct spdk_nvmf_rdma_mgmt_channel), + "rdma_transport"); + + TAILQ_INIT(&rtransport->devices); + TAILQ_INIT(&rtransport->ports); + + rtransport->transport.ops = &spdk_nvmf_transport_rdma; + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth); + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->event_channel = 
rdma_create_event_channel(); + if (rtransport->event_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + flag = fcntl(rtransport->event_channel->fd, F_GETFL); + if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + rtransport->event_channel->fd, spdk_strerror(errno)); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma", + opts->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */ + opts->max_io_size + NVMF_DATA_BUFFER_ALIGNMENT, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!rtransport->data_buf_pool) { + SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + i = 0; + rc = 0; + while (contexts[i] != NULL) { + device = calloc(1, sizeof(*device)); + if (!device) { + SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); + rc = -ENOMEM; + break; + } + device->context = contexts[i]; + rc = ibv_query_device(device->context, &device->attr); + if (rc < 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + free(device); + break; + + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { + SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); + SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); + } + + /** + * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. + * The Soft-RoCE RXE driver does not currently support send with invalidate, + * but incorrectly reports that it does. There are changes making their way + * through the kernel now that will enable this feature. When they are merged, + * we can conditionally enable this feature. + * + * TODO: enable this for versions of the kernel rxe driver that support it. 
+ */ + if (device->attr.vendor_id == 0) { + device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); + } +#endif + + /* set up device context async ev fd as NON_BLOCKING */ + flag = fcntl(device->context->async_fd, F_GETFL); + rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); + if (rc < 0) { + SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); + free(device); + break; + } + + device->pd = ibv_alloc_pd(device->context); + if (!device->pd) { + SPDK_ERRLOG("Unable to allocate protection domain.\n"); + free(device); + rc = -1; + break; + } + + device->map = spdk_mem_map_alloc(0, &nvmf_rdma_map_ops, device); + if (!device->map) { + SPDK_ERRLOG("Unable to allocate memory map for new poll group\n"); + ibv_dealloc_pd(device->pd); + free(device); + rc = -1; + break; + } + + TAILQ_INSERT_TAIL(&rtransport->devices, device, link); + i++; + } + rdma_free_devices(contexts); + + if (rc < 0) { + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + /* Set up poll descriptor array to monitor events from RDMA and IB + * in a single poll syscall + */ + rtransport->npoll_fds = i + 1; + i = 0; + rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); + if (rtransport->poll_fds == NULL) { + SPDK_ERRLOG("poll_fds allocation failed\n"); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->poll_fds[i].fd = rtransport->event_channel->fd; + rtransport->poll_fds[i++].events = POLLIN; + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + rtransport->poll_fds[i].fd = device->context->async_fd; + rtransport->poll_fds[i++].events = POLLIN; + } + + return &rtransport->transport; +} + +static int +spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *port_tmp; + struct spdk_nvmf_rdma_device *device, *device_tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + + if (rtransport->poll_fds != NULL) { + free(rtransport->poll_fds); + } + + if (rtransport->event_channel != NULL) { + rdma_destroy_event_channel(rtransport->event_channel); + } + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { + TAILQ_REMOVE(&rtransport->devices, device, link); + if (device->map) { + spdk_mem_map_free(&device->map); + } + if (device->pd) { + ibv_dealloc_pd(device->pd); + } + free(device); + } + + if (rtransport->data_buf_pool != NULL) { + if (spdk_mempool_count(rtransport->data_buf_pool) != + (transport->opts.max_queue_depth * 4)) { + SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", + spdk_mempool_count(rtransport->data_buf_pool), + transport->opts.max_queue_depth * 4); + } + } + + spdk_mempool_free(rtransport->data_buf_pool); + spdk_io_device_unregister(rtransport, NULL); + pthread_mutex_destroy(&rtransport->lock); + free(rtransport); + + return 0; +} + +static int +spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_port *port_tmp, *port; + struct addrinfo *res; + struct addrinfo hints; + int family; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + port = calloc(1, 
sizeof(*port)); + if (!port) { + return -ENOMEM; + } + + /* Selectively copy the trid. Things like NQN don't matter here - that + * mapping is enforced elsewhere. + */ + port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; + port->trid.adrfam = trid->adrfam; + snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); + snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); + + pthread_mutex_lock(&rtransport->lock); + assert(rtransport->event_channel != NULL); + TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { + if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { + port_tmp->ref++; + free(port); + /* Already listening at this address */ + pthread_mutex_unlock(&rtransport->lock); + return 0; + } + } + + rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + switch (port->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_flags = AI_NUMERICSERV; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); + if (rc) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + rc = rdma_bind_addr(port->id, res->ai_addr); + freeaddrinfo(res); + + if (rc < 0) { + SPDK_ERRLOG("rdma_bind_addr() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + if (!port->id->verbs) { + SPDK_ERRLOG("ibv_context is null\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -1; + } + + rc = rdma_listen(port->id, 10); /* 10 = backlog */ + if (rc < 0) { + SPDK_ERRLOG("rdma_listen() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + TAILQ_FOREACH(device, &rtransport->devices, link) { + if (device->context == port->id->verbs) { + port->device = device; + break; + } + } + if (!port->device) { + SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", + port->id->verbs); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", + port->trid.traddr, ntohs(rdma_get_src_port(port->id))); + + port->ref = 1; + + TAILQ_INSERT_TAIL(&rtransport->ports, port, link); + pthread_mutex_unlock(&rtransport->lock); + + return 0; +} + +static int +spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *_trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *tmp; + struct spdk_nvme_transport_id trid = {}; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + /* Selectively copy the trid. Things like NQN don't matter here - that + * mapping is enforced elsewhere. 
+ */ + trid.trtype = SPDK_NVME_TRANSPORT_RDMA; + trid.adrfam = _trid->adrfam; + snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); + snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { + if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { + assert(port->ref > 0); + port->ref--; + if (port->ref == 0) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + break; + } + } + + pthread_mutex_unlock(&rtransport->lock); + return 0; +} + +static bool +spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) +{ + int cur_queue_depth, cur_rdma_rw_depth; + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + cur_queue_depth = spdk_nvmf_rdma_cur_queue_depth(rqpair); + cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair); + + if (cur_queue_depth == 0 && cur_rdma_rw_depth == 0) { + return true; + } + return false; +} + +static void +spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + + /* We process I/O in the data transfer pending queue at the highest priority. */ + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING], + state_link, req_tmp) { + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + + /* The second highest priority is I/O waiting on memory buffers. */ + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, + req_tmp) { + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + + if (rqpair->qpair_disconnected) { + spdk_nvmf_rdma_qpair_destroy(rqpair); + return; + } + + /* Do not process newly received commands if qp is in ERROR state, + * wait till the recovery is complete. + */ + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + return; + } + + /* The lowest priority is processing newly received commands */ + TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) { + if (TAILQ_EMPTY(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE])) { + break; + } + + rdma_req = TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE]); + rdma_req->recv = rdma_recv; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEW); + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } +} + +static void +spdk_nvmf_rdma_drain_state_queue(struct spdk_nvmf_rdma_qpair *rqpair, + enum spdk_nvmf_rdma_request_state state) +{ + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + struct spdk_nvmf_rdma_transport *rtransport; + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[state], state_link, req_tmp) { + rtransport = SPDK_CONTAINEROF(rdma_req->req.qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + } +} + +static void +spdk_nvmf_rdma_qpair_recover(struct spdk_nvmf_rdma_qpair *rqpair) +{ + enum ibv_qp_state state, next_state; + int recovered; + struct spdk_nvmf_rdma_transport *rtransport; + + if (!spdk_nvmf_rdma_qpair_is_idle(&rqpair->qpair)) { + /* There must be outstanding requests down to media. + * If so, wait till they're complete. 
+ */ + assert(!TAILQ_EMPTY(&rqpair->qpair.outstanding)); + return; + } + + state = rqpair->ibv_attr.qp_state; + next_state = state; + + SPDK_NOTICELOG("RDMA qpair %u is in state: %s\n", + rqpair->qpair.qid, + str_ibv_qp_state[state]); + + if (!(state == IBV_QPS_ERR || state == IBV_QPS_RESET)) { + SPDK_ERRLOG("Can't recover RDMA qpair %u from the state: %s\n", + rqpair->qpair.qid, + str_ibv_qp_state[state]); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); + return; + } + + recovered = 0; + while (!recovered) { + switch (state) { + case IBV_QPS_ERR: + next_state = IBV_QPS_RESET; + break; + case IBV_QPS_RESET: + next_state = IBV_QPS_INIT; + break; + case IBV_QPS_INIT: + next_state = IBV_QPS_RTR; + break; + case IBV_QPS_RTR: + next_state = IBV_QPS_RTS; + break; + case IBV_QPS_RTS: + recovered = 1; + break; + default: + SPDK_ERRLOG("RDMA qpair %u unexpected state for recovery: %u\n", + rqpair->qpair.qid, state); + goto error; + } + /* Do not transition into same state */ + if (next_state == state) { + break; + } + + if (spdk_nvmf_rdma_set_ibv_state(rqpair, next_state)) { + goto error; + } + + state = next_state; + } + + rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, + transport); + + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + + return; +error: + SPDK_NOTICELOG("RDMA qpair %u: recovery failed, disconnecting...\n", + rqpair->qpair.qid); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); +} + +/* Clean up only the states that can be aborted at any time */ +static void +_spdk_nvmf_rdma_qp_cleanup_safe_states(struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_NEW); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_NEED_BUFFER], link, req_tmp) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + } + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_NEED_BUFFER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_EXECUTED); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETED); +} + +/* This cleans up all memory. It is only safe to use if the rest of the software stack + * has been shut down */ +static void +_spdk_nvmf_rdma_qp_cleanup_all_states(struct spdk_nvmf_rdma_qpair *rqpair) +{ + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_EXECUTING); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETING); +} + +static void +_spdk_nvmf_rdma_qp_error(void *arg) +{ + struct spdk_nvmf_rdma_qpair *rqpair = arg; + enum ibv_qp_state state; + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + state = rqpair->ibv_attr.qp_state; + if (state != IBV_QPS_ERR) { + /* Error was already recovered */ + return; + } + + if (spdk_nvmf_qpair_is_admin_queue(&rqpair->qpair)) { + spdk_nvmf_ctrlr_abort_aer(rqpair->qpair.ctrlr); + } + + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + /* Attempt recovery. 
This will exit without recovering if I/O requests + * are still outstanding */ + spdk_nvmf_rdma_qpair_recover(rqpair); +} + +static void +_spdk_nvmf_rdma_qp_last_wqe(void *arg) +{ + struct spdk_nvmf_rdma_qpair *rqpair = arg; + enum ibv_qp_state state; + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + state = rqpair->ibv_attr.qp_state; + if (state != IBV_QPS_ERR) { + /* Error was already recovered */ + return; + } + + /* Clear out the states that are safe to clear any time, plus the + * RDMA data transfer states. */ + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETING); + + spdk_nvmf_rdma_qpair_recover(rqpair); +} + +static void +spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) +{ + int rc; + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_async_event event; + enum ibv_qp_state state; + + rc = ibv_get_async_event(device->context, &event); + + if (rc) { + SPDK_ERRLOG("Failed to get async_event (%d): %s\n", + errno, spdk_strerror(errno)); + return; + } + + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + + switch (event.event_type) { + case IBV_EVENT_QP_FATAL: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_error, rqpair); + break; + case IBV_EVENT_QP_LAST_WQE_REACHED: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_last_wqe, rqpair); + break; + case IBV_EVENT_SQ_DRAINED: + /* This event occurs frequently in both error and non-error states. + * Check if the qpair is in an error state before sending a message. + * Note that we're not on the correct thread to access the qpair, but + * the operations that the below calls make all happen to be thread + * safe. 
*/ + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + state = spdk_nvmf_rdma_update_ibv_state(rqpair); + if (state == IBV_QPS_ERR) { + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_error, rqpair); + } + break; + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + break; + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_GID_CHANGE: + default: + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); + break; + } + ibv_ack_async_event(&event); +} + +static void +spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + int nfds, i = 0; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); + + if (nfds <= 0) { + return; + } + + /* The first poll descriptor is RDMA CM event */ + if (rtransport->poll_fds[i++].revents & POLLIN) { + spdk_nvmf_process_cm_event(transport, cb_fn); + nfds--; + } + + if (nfds == 0) { + return; + } + + /* Second and subsequent poll descriptors are IB async events */ + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + if (rtransport->poll_fds[i++].revents & POLLIN) { + spdk_nvmf_process_ib_event(device); + nfds--; + } + } + /* check all flagged fd's have been served */ + assert(nfds == 0); +} + +static void +spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_RDMA; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; + entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; + entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; +} + +static struct spdk_nvmf_transport_poll_group * +spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller; + struct spdk_nvmf_rdma_device *device; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + rgroup = calloc(1, sizeof(*rgroup)); + if (!rgroup) { + return NULL; + } + + TAILQ_INIT(&rgroup->pollers); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH(device, &rtransport->devices, link) { + poller = calloc(1, sizeof(*poller)); + if (!poller) { + SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); + free(rgroup); + pthread_mutex_unlock(&rtransport->lock); + return NULL; 
+ } + + poller->device = device; + poller->group = rgroup; + + TAILQ_INIT(&poller->qpairs); + + poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0); + if (!poller->cq) { + SPDK_ERRLOG("Unable to create completion queue\n"); + free(poller); + free(rgroup); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); + } + + pthread_mutex_unlock(&rtransport->lock); + return &rgroup->group; +} + +static void +spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller, *tmp; + struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + if (!rgroup) { + return; + } + + TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { + TAILQ_REMOVE(&rgroup->pollers, poller, link); + + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { + _spdk_nvmf_rdma_qp_cleanup_all_states(qpair); + spdk_nvmf_rdma_qpair_destroy(qpair); + } + + free(poller); + } + + free(rgroup); +} + +static int +spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + int rc; + + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + device = rqpair->port->device; + + TAILQ_FOREACH(poller, &rgroup->pollers, link) { + if (poller->device == device) { + break; + } + } + + if (!poller) { + SPDK_ERRLOG("No poller found for device.\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); + rqpair->poller = poller; + + rc = spdk_nvmf_rdma_qpair_initialize(qpair); + if (rc < 0) { + SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); + return -1; + } + + rqpair->mgmt_channel = spdk_get_io_channel(rtransport); + if (!rqpair->mgmt_channel) { + spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel); + assert(rqpair->ch != NULL); + + rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); + if (rc) { + /* Try to reject, but we probably can't */ + spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + spdk_nvmf_rdma_update_ibv_state(rqpair); + + return 0; +} + +static int +spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + + if (rdma_req->data_from_pool) { + /* Put the buffer/s back in the pool */ + for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->data.buffers[i] = NULL; + } + rdma_req->data_from_pool = false; + } + 
rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE); + return 0; +} + +static int +spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, + struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { + /* The connection is alive, so process the request as normal */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTED); + } else { + /* The connection is dead. Move the request directly to the completed state. */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } + + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE && rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + /* If the NVMe-oF layer thinks the connection is active, but the RDMA layer thinks + * the connection is dead, perform error recovery. */ + spdk_nvmf_rdma_qpair_recover(rqpair); + } + + return 0; +} + +static void +spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_nvmf_rdma_qpair_destroy(rqpair); +} + +static struct spdk_nvmf_rdma_request * +get_rdma_req_from_wc(struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_request *rdma_req; + + rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id; + assert(rdma_req != NULL); + +#ifdef DEBUG + struct spdk_nvmf_rdma_qpair *rqpair; + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req - rqpair->reqs >= 0); + assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth); +#endif + + return rdma_req; +} + +static struct spdk_nvmf_rdma_recv * +get_rdma_recv_from_wc(struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_recv *rdma_recv; + + assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd)); + + rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id; + assert(rdma_recv != NULL); + +#ifdef DEBUG + struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair; + + assert(rdma_recv - rqpair->recvs >= 0); + assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth); +#endif + + return rdma_recv; +} + +#ifdef DEBUG +static int +spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) +{ + return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || + rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; +} +#endif + +static int +spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct ibv_wc wc[32]; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_qpair *rqpair; + int reaped, i; + int count = 0; + bool error = false; + + /* Poll for completing operations. */ + reaped = ibv_poll_cq(rpoller->cq, 32, wc); + if (reaped < 0) { + SPDK_ERRLOG("Error polling CQ! 
(%d): %s\n", + errno, spdk_strerror(errno)); + return -1; + } + + for (i = 0; i < reaped; i++) { + /* Handle error conditions */ + if (wc[i].status) { + SPDK_WARNLOG("CQ error on CQ %p, Request 0x%lu (%d): %s\n", + rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + error = true; + + switch (wc[i].opcode) { + case IBV_WC_SEND: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* We're going to attempt an error recovery, so force the request into + * the completed state. */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + break; + case IBV_WC_RECV: + rdma_recv = get_rdma_recv_from_wc(&wc[i]); + rqpair = rdma_recv->qpair; + + /* Dump this into the incoming queue. This gets cleaned up when + * the queue pair disconnects or recovers. */ + TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); + break; + case IBV_WC_RDMA_WRITE: + case IBV_WC_RDMA_READ: + /* If the data transfer fails still force the queue into the error state, + * but the rdma_req objects should only be manipulated in response to + * SEND and RECV operations. */ + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + break; + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + + /* Set the qpair to the error state. This will initiate a recovery. */ + spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); + continue; + } + + switch (wc[i].opcode) { + case IBV_WC_SEND: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + count++; + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RDMA_WRITE: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RDMA_READ: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RECV: + rdma_recv = get_rdma_recv_from_wc(&wc[i]); + rqpair = rdma_recv->qpair; + + TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + } + + if (error == true) { + return -1; + } + + return count; +} + +static int +spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller 
*rpoller; + int count, rc; + + rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + count = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); + if (rc < 0) { + return rc; + } + count += rc; + } + + return count; +} + +static int +spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer) +{ + struct sockaddr *saddr; + uint16_t port; + + trid->trtype = SPDK_NVME_TRANSPORT_RDMA; + + if (peer) { + saddr = rdma_get_peer_addr(id); + } else { + saddr = rdma_get_local_addr(id); + } + switch (saddr->sa_family) { + case AF_INET: { + struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; + + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + inet_ntop(AF_INET, &saddr_in->sin_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + case AF_INET6: { + struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + inet_ntop(AF_INET6, &saddr_in->sin6_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + default: + return -1; + + } + + return 0; +} + +static int +spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); +} + +static int +spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); +} + +static int +spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { + .type = SPDK_NVME_TRANSPORT_RDMA, + .opts_init = spdk_nvmf_rdma_opts_init, + .create = spdk_nvmf_rdma_create, + .destroy = spdk_nvmf_rdma_destroy, + + .listen = spdk_nvmf_rdma_listen, + .stop_listen = spdk_nvmf_rdma_stop_listen, + .accept = spdk_nvmf_rdma_accept, + + .listener_discover = spdk_nvmf_rdma_discover, + + .poll_group_create = spdk_nvmf_rdma_poll_group_create, + .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, + .poll_group_add = spdk_nvmf_rdma_poll_group_add, + .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, + + .req_free = spdk_nvmf_rdma_request_free, + .req_complete = spdk_nvmf_rdma_request_complete, + + .qpair_fini = spdk_nvmf_rdma_close_qpair, + .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, + .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, + .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, + .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, + +}; + +SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) diff --git 
a/src/spdk/lib/nvmf/request.c b/src/spdk/lib/nvmf/request.c new file mode 100644 index 00000000..88b6b9a9 --- /dev/null +++ b/src/spdk/lib/nvmf/request.c @@ -0,0 +1,190 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +static void +spdk_nvmf_qpair_request_cleanup(struct spdk_nvmf_qpair *qpair) +{ + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING) { + assert(qpair->state_cb != NULL); + + if (TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb(qpair->state_cb_arg, 0); + } + } else { + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + } +} + +int +spdk_nvmf_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (spdk_nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Unable to free transport level request resources.\n"); + } + + spdk_nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +int +spdk_nvmf_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_qpair *qpair; + + rsp->sqid = 0; + rsp->status.p = 0; + rsp->cid = req->cmd->nvme_cmd.cid; + + qpair = req->qpair; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "cpl: cid=%u cdw0=0x%08x rsvd1=%u status=0x%04x\n", + rsp->cid, rsp->cdw0, rsp->rsvd1, + *(uint16_t *)&rsp->status); + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (spdk_nvmf_transport_req_complete(req)) { + SPDK_ERRLOG("Transport request completion error!\n"); + } + + spdk_nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +static void +nvmf_trace_command(union nvmf_h2c_msg *h2c_msg, bool is_admin_queue) +{ + struct spdk_nvmf_capsule_cmd *cap_hdr = &h2c_msg->nvmf_cmd; + struct spdk_nvme_cmd *cmd = &h2c_msg->nvme_cmd; + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + uint8_t opc; + + if (cmd->opc == SPDK_NVME_OPC_FABRIC) { + opc = cap_hdr->fctype; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s Fabrics cmd: fctype 0x%02x cid %u\n", + is_admin_queue ? "Admin" : "I/O", + cap_hdr->fctype, cap_hdr->cid); + } else { + opc = cmd->opc; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s cmd: opc 0x%02x fuse %u cid %u nsid %u cdw10 0x%08x\n", + is_admin_queue ? "Admin" : "I/O", + cmd->opc, cmd->fuse, cmd->cid, cmd->nsid, cmd->cdw10); + if (cmd->mptr) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "mptr 0x%" PRIx64 "\n", cmd->mptr); + } + if (cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_CONTIG && + cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_SGL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "psdt %u\n", cmd->psdt); + } + } + + if (spdk_nvme_opc_get_data_transfer(opc) != SPDK_NVME_DATA_NONE) { + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "SGL: Keyed%s: addr 0x%" PRIx64 " key 0x%x len 0x%x\n", + sgl->generic.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY ? " (Inv)" : "", + sgl->address, sgl->keyed.key, sgl->keyed.length); + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL: Data block: %s 0x%" PRIx64 " len 0x%x\n", + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET ? 
"offs" : "addr", + sgl->address, sgl->unkeyed.length); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL type 0x%x subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + } + } +} + +void +spdk_nvmf_request_exec(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + spdk_nvmf_request_exec_status status; + + nvmf_trace_command(req->cmd, spdk_nvmf_qpair_is_admin_queue(qpair)); + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + spdk_nvmf_request_complete(req); + return; + } + + /* Check if the subsystem is paused (if there is a subsystem) */ + if (qpair->ctrlr) { + struct spdk_nvmf_subsystem_poll_group *sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) { + /* The subsystem is not currently active. Queue this request. */ + TAILQ_INSERT_TAIL(&sgroup->queued, req, link); + return; + } + + } + + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + status = spdk_nvmf_ctrlr_process_fabrics_cmd(req); + } else if (spdk_unlikely(spdk_nvmf_qpair_is_admin_queue(qpair))) { + status = spdk_nvmf_ctrlr_process_admin_cmd(req); + } else { + status = spdk_nvmf_ctrlr_process_io_cmd(req); + } + + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + spdk_nvmf_request_complete(req); + } +} diff --git a/src/spdk/lib/nvmf/subsystem.c b/src/spdk/lib/nvmf/subsystem.c new file mode 100644 index 00000000..9e28f3c6 --- /dev/null +++ b/src/spdk/lib/nvmf/subsystem.c @@ -0,0 +1,1269 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/event.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/utf.h" + +/* + * States for parsing valid domains in NQNs according to RFC 1034 + */ +enum spdk_nvmf_nqn_domain_states { + /* First character of a domain must be a letter */ + SPDK_NVMF_DOMAIN_ACCEPT_LETTER = 0, + + /* Subsequent characters can be any of letter, digit, or hyphen */ + SPDK_NVMF_DOMAIN_ACCEPT_LDH = 1, + + /* A domain label must end with either a letter or digit */ + SPDK_NVMF_DOMAIN_ACCEPT_ANY = 2 +}; + +/* Returns true if is a valid ASCII string as defined by the NVMe spec */ +static bool +spdk_nvmf_valid_ascii_string(const void *buf, size_t size) +{ + const uint8_t *str = buf; + size_t i; + + for (i = 0; i < size; i++) { + if (str[i] < 0x20 || str[i] > 0x7E) { + return false; + } + } + + return true; +} + +static bool +spdk_nvmf_valid_nqn(const char *nqn) +{ + size_t len; + struct spdk_uuid uuid_value; + uint32_t i; + int bytes_consumed; + uint32_t domain_label_length; + char *reverse_domain_end; + uint32_t reverse_domain_end_index; + enum spdk_nvmf_nqn_domain_states domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + + /* Check for length requirements */ + len = strlen(nqn); + if (len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu > max %d\n", nqn, len, SPDK_NVMF_NQN_MAX_LEN); + return false; + } + + /* The nqn must be at least as long as SPDK_NVMF_NQN_MIN_LEN to contain the necessary prefix. */ + if (len < SPDK_NVMF_NQN_MIN_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu < min %d\n", nqn, len, SPDK_NVMF_NQN_MIN_LEN); + return false; + } + + /* Check for discovery controller nqn */ + if (!strcmp(nqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + /* Check for equality with the generic nqn structure of the form "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555" */ + if (!strncmp(nqn, SPDK_NVMF_NQN_UUID_PRE, SPDK_NVMF_NQN_UUID_PRE_LEN)) { + if (len != SPDK_NVMF_NQN_UUID_PRE_LEN + SPDK_NVMF_UUID_STRING_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not the correct length\n", nqn); + return false; + } + + if (spdk_uuid_parse(&uuid_value, &nqn[SPDK_NVMF_NQN_UUID_PRE_LEN])) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not formatted correctly\n", nqn); + return false; + } + return true; + } + + /* If the nqn does not match the uuid structure, the next several checks validate the form "nqn.yyyy-mm.reverse.domain:user-string" */ + + if (strncmp(nqn, "nqn.", 4) != 0) { + SPDK_ERRLOG("Invalid NQN \"%s\": NQN must begin with \"nqn.\".\n", nqn); + return false; + } + + /* Check for yyyy-mm. */ + if (!(isdigit(nqn[4]) && isdigit(nqn[5]) && isdigit(nqn[6]) && isdigit(nqn[7]) && + nqn[8] == '-' && isdigit(nqn[9]) && isdigit(nqn[10]) && nqn[11] == '.')) { + SPDK_ERRLOG("Invalid date code in NQN \"%s\"\n", nqn); + return false; + } + + reverse_domain_end = strchr(nqn, ':'); + if (reverse_domain_end != NULL && (reverse_domain_end_index = reverse_domain_end - nqn) < len - 1) { + } else { + SPDK_ERRLOG("Invalid NQN \"%s\". 
NQN must contain user specified name with a ':' as a prefix.\n", + nqn); + return false; + } + + /* Check for valid reverse domain */ + domain_label_length = 0; + for (i = 12; i < reverse_domain_end_index; i++) { + if (domain_label_length > SPDK_DOMAIN_LABEL_MAX_LEN) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". At least one Label is too long.\n", nqn); + return false; + } + + switch (domain_state) { + + case SPDK_NVMF_DOMAIN_ACCEPT_LETTER: { + if (isalpha(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must start with a letter.\n", nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_LDH: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_ANY: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + domain_label_length = 0; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + } + } + + i = reverse_domain_end_index + 1; + while (i < len) { + bytes_consumed = utf8_valid(&nqn[i], &nqn[len]); + if (bytes_consumed <= 0) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". 
Label names must contain only valid utf-8.\n", nqn); + return false; + } + + i += bytes_consumed; + } + return true; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt, + const char *nqn, + enum spdk_nvmf_subtype type, + uint32_t num_ns) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (spdk_nvmf_tgt_find_subsystem(tgt, nqn)) { + SPDK_ERRLOG("Subsystem NQN '%s' already exists\n", nqn); + return NULL; + } + + if (!spdk_nvmf_valid_nqn(nqn)) { + return NULL; + } + + if (type == SPDK_NVMF_SUBTYPE_DISCOVERY && num_ns != 0) { + SPDK_ERRLOG("Discovery subsystem cannot have namespaces.\n"); + return NULL; + } + + /* Find a free subsystem id (sid) */ + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + if (tgt->subsystems[sid] == NULL) { + break; + } + } + if (sid >= tgt->opts.max_subsystems) { + return NULL; + } + + subsystem = calloc(1, sizeof(struct spdk_nvmf_subsystem)); + if (subsystem == NULL) { + return NULL; + } + + subsystem->thread = spdk_get_thread(); + subsystem->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + subsystem->tgt = tgt; + subsystem->id = sid; + subsystem->subtype = type; + subsystem->max_nsid = num_ns; + subsystem->max_allowed_nsid = num_ns; + subsystem->next_cntlid = 0; + snprintf(subsystem->subnqn, sizeof(subsystem->subnqn), "%s", nqn); + TAILQ_INIT(&subsystem->listeners); + TAILQ_INIT(&subsystem->hosts); + TAILQ_INIT(&subsystem->ctrlrs); + + if (num_ns != 0) { + subsystem->ns = calloc(num_ns, sizeof(struct spdk_nvmf_ns *)); + if (subsystem->ns == NULL) { + SPDK_ERRLOG("Namespace memory allocation failed\n"); + free(subsystem); + return NULL; + } + } + + memset(subsystem->sn, '0', sizeof(subsystem->sn) - 1); + subsystem->sn[sizeof(subsystem->sn) - 1] = '\0'; + + tgt->subsystems[sid] = subsystem; + tgt->discovery_genctr++; + + return subsystem; +} + +static void +_spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_host *host) +{ + TAILQ_REMOVE(&subsystem->hosts, host, link); + free(host->nqn); + free(host); +} + +static int _spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid); + +void +spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_listener *listener, *listener_tmp; + struct spdk_nvmf_host *host, *host_tmp; + struct spdk_nvmf_ctrlr *ctrlr, *ctrlr_tmp; + struct spdk_nvmf_ns *ns; + + if (!subsystem) { + return; + } + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "subsystem is %p\n", subsystem); + + TAILQ_FOREACH_SAFE(listener, &subsystem->listeners, link, listener_tmp) { + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); + } + + TAILQ_FOREACH_SAFE(host, &subsystem->hosts, link, host_tmp) { + _spdk_nvmf_subsystem_remove_host(subsystem, host); + } + + TAILQ_FOREACH_SAFE(ctrlr, &subsystem->ctrlrs, link, ctrlr_tmp) { + spdk_nvmf_ctrlr_destruct(ctrlr); + } + + ns = spdk_nvmf_subsystem_get_first_ns(subsystem); + while (ns != NULL) { + struct spdk_nvmf_ns *next_ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns); + + _spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + ns = next_ns; + } + + free(subsystem->ns); + + subsystem->tgt->subsystems[subsystem->id] = NULL; + subsystem->tgt->discovery_genctr++; + + free(subsystem); +} + +static int +spdk_nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state state) +{ + enum spdk_nvmf_subsystem_state actual_old_state, expected_old_state; + + switch (state) { + case 
SPDK_NVMF_SUBSYSTEM_INACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + case SPDK_NVMF_SUBSYSTEM_RESUMING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSED; + break; + case SPDK_NVMF_SUBSYSTEM_DEACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + default: + assert(false); + return -1; + } + + actual_old_state = __sync_val_compare_and_swap(&subsystem->state, expected_old_state, state); + if (actual_old_state != expected_old_state) { + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING && + state == SPDK_NVMF_SUBSYSTEM_ACTIVE) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } + /* This is for the case when activating the subsystem fails. */ + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_ACTIVATING && + state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + actual_old_state = __sync_val_compare_and_swap(&subsystem->state, expected_old_state, state); + } + assert(actual_old_state == expected_old_state); + return actual_old_state - expected_old_state; +} + +struct subsystem_state_change_ctx { + struct spdk_nvmf_subsystem *subsystem; + + enum spdk_nvmf_subsystem_state requested_state; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + status = spdk_nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state); + if (status) { + status = -1; + } + } + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_state_change_continue(void *ctx, int status) +{ + struct spdk_io_channel_iter *i = ctx; + spdk_for_each_channel_continue(i, status); +} + +static void +subsystem_state_change_on_pg(struct spdk_io_channel_iter *i) +{ + struct subsystem_state_change_ctx *ctx; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctx = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = spdk_io_channel_get_ctx(ch); + + switch (ctx->requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + spdk_nvmf_poll_group_remove_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_ACTIVATING) { + spdk_nvmf_poll_group_add_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } else if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_RESUMING) { + spdk_nvmf_poll_group_resume_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + spdk_nvmf_poll_group_pause_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + default: + assert(false); + break; + } +} + +static int +spdk_nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state requested_state, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + struct subsystem_state_change_ctx *ctx; + enum 
spdk_nvmf_subsystem_state intermediate_state; + int rc; + + switch (requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } else { + intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + default: + assert(false); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + rc = spdk_nvmf_subsystem_set_state(subsystem, intermediate_state); + if (rc) { + free(ctx); + return rc; + } + + ctx->subsystem = subsystem; + ctx->requested_state = requested_state; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(subsystem->tgt, + subsystem_state_change_on_pg, + ctx, + subsystem_state_change_done); + + return 0; +} + +int +spdk_nvmf_subsystem_start(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_stop(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_INACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_pause(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_PAUSED, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_resume(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_first(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_next(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t sid; + struct spdk_nvmf_tgt *tgt; + + if (!subsystem) { + return NULL; + } + + tgt = subsystem->tgt; + + for (sid = subsystem->id + 1; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +static struct spdk_nvmf_host * +_spdk_nvmf_subsystem_find_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host = NULL; + + TAILQ_FOREACH(host, &subsystem->hosts, link) { + if (strcmp(hostnqn, host->nqn) == 0) { + return host; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!spdk_nvmf_valid_nqn(hostnqn)) { + return -EINVAL; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (_spdk_nvmf_subsystem_find_host(subsystem, hostnqn)) { + /* This subsystem already allows the specified host. 
*/ + return 0; + } + + host = calloc(1, sizeof(*host)); + if (!host) { + return -ENOMEM; + } + host->nqn = strdup(hostnqn); + if (!host->nqn) { + free(host); + return -ENOMEM; + } + + TAILQ_INSERT_HEAD(&subsystem->hosts, host, link); + subsystem->tgt->discovery_genctr++; + + return 0; +} + +int +spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + host = _spdk_nvmf_subsystem_find_host(subsystem, hostnqn); + if (host == NULL) { + return -ENOENT; + } + + _spdk_nvmf_subsystem_remove_host(subsystem, host); + return 0; +} + +int +spdk_nvmf_subsystem_set_allow_any_host(struct spdk_nvmf_subsystem *subsystem, bool allow_any_host) +{ + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + subsystem->allow_any_host = allow_any_host; + + return 0; +} + +bool +spdk_nvmf_subsystem_get_allow_any_host(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_host; +} + +bool +spdk_nvmf_subsystem_host_allowed(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + if (!hostnqn) { + return false; + } + + if (subsystem->allow_any_host) { + return true; + } + + return _spdk_nvmf_subsystem_find_host(subsystem, hostnqn) != NULL; +} + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_first_host(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->hosts); +} + + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_next_host(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_host *prev_host) +{ + return TAILQ_NEXT(prev_host, link); +} + +const char * +spdk_nvmf_host_get_nqn(struct spdk_nvmf_host *host) +{ + return host->nqn; +} + +static struct spdk_nvmf_listener * +_spdk_nvmf_subsystem_find_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (_spdk_nvmf_subsystem_find_listener(subsystem, trid)) { + /* Listener already exists in this subsystem */ + return 0; + } + + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, trid->trtype); + if (transport == NULL) { + SPDK_ERRLOG("Unknown transport type %d\n", trid->trtype); + return -EINVAL; + } + + listener = calloc(1, sizeof(*listener)); + if (!listener) { + return -ENOMEM; + } + + listener->trid = *trid; + listener->transport = transport; + + TAILQ_INSERT_HEAD(&subsystem->listeners, listener, link); + + return 0; +} + +int +spdk_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + listener = _spdk_nvmf_subsystem_find_listener(subsystem, trid); + if (listener == NULL) { + return -ENOENT; 
+ } + + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); + + return 0; +} + +bool +spdk_nvmf_subsystem_listener_allowed(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + if (!strcmp(subsystem->subnqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return true; + } + } + + return false; +} + +struct spdk_nvmf_listener * +spdk_nvmf_subsystem_get_first_listener(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->listeners); +} + +struct spdk_nvmf_listener * +spdk_nvmf_subsystem_get_next_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_listener *prev_listener) +{ + return TAILQ_NEXT(prev_listener, link); +} + +const struct spdk_nvme_transport_id * +spdk_nvmf_listener_get_trid(struct spdk_nvmf_listener *listener) +{ + return &listener->trid; +} + +struct subsystem_update_ns_ctx { + struct spdk_nvmf_subsystem *subsystem; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_update_ns_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_update_ns_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_update_ns_on_pg(struct spdk_io_channel_iter *i) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem *subsystem; + + ctx = spdk_io_channel_iter_get_ctx(i); + group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i)); + subsystem = ctx->subsystem; + + rc = spdk_nvmf_poll_group_update_subsystem(group, subsystem); + spdk_for_each_channel_continue(i, rc); +} + +static int +spdk_nvmf_subsystem_update_ns(struct spdk_nvmf_subsystem *subsystem, spdk_channel_for_each_cpl cpl, + void *ctx) +{ + spdk_for_each_channel(subsystem->tgt, + subsystem_update_ns_on_pg, + ctx, + cpl); + + return 0; +} + +static void +spdk_nvmf_subsystem_ns_changed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + spdk_nvmf_ctrlr_ns_changed(ctrlr, nsid); + } +} + +static int +_spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ns *ns; + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED || + subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + if (nsid == 0 || nsid > subsystem->max_nsid) { + return -1; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -1; + } + + ns = subsystem->ns[nsid - 1]; + if (!ns) { + return -1; + } + + subsystem->ns[nsid - 1] = NULL; + + spdk_bdev_module_release_bdev(ns->bdev); + spdk_bdev_close(ns->desc); + free(ns); + + spdk_nvmf_subsystem_ns_changed(subsystem, nsid); + + return 0; +} + +int +spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid, + spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + + rc = _spdk_nvmf_subsystem_remove_ns(subsystem, nsid); + if (rc < 0) { + return rc; + } + + ctx = calloc(1, sizeof(*ctx)); + + if (ctx == NULL) { + return -ENOMEM; + } + + ctx->subsystem = subsystem; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_nvmf_subsystem_update_ns(subsystem, 
subsystem_update_ns_done, ctx); + + return 0; +} + +static void +_spdk_nvmf_ns_hot_remove_done(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + if (status != 0) { + SPDK_ERRLOG("Failed to make changes to NVMe-oF subsystem with id %u\n", subsystem->id); + } + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + +static void +_spdk_nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + + spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid, _spdk_nvmf_ns_hot_remove_done, + subsystem); +} + +static void +spdk_nvmf_ns_hot_remove(void *remove_ctx) +{ + struct spdk_nvmf_ns *ns = remove_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _spdk_nvmf_ns_hot_remove, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + } +} + +void +spdk_nvmf_ns_opts_get_defaults(struct spdk_nvmf_ns_opts *opts, size_t opts_size) +{ + /* All current fields are set to 0 by default. */ + memset(opts, 0, opts_size); +} + +/* Dummy bdev module used to to claim bdevs. */ +static struct spdk_bdev_module ns_bdev_module = { + .name = "NVMe-oF Target", +}; + +uint32_t +spdk_nvmf_subsystem_add_ns(struct spdk_nvmf_subsystem *subsystem, struct spdk_bdev *bdev, + const struct spdk_nvmf_ns_opts *user_opts, size_t opts_size) +{ + struct spdk_nvmf_ns_opts opts; + struct spdk_nvmf_ns *ns; + int rc; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return 0; + } + + spdk_nvmf_ns_opts_get_defaults(&opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + } + + if (spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) { + opts.uuid = *spdk_bdev_get_uuid(bdev); + } + + if (opts.nsid == SPDK_NVME_GLOBAL_NS_TAG) { + SPDK_ERRLOG("Invalid NSID %" PRIu32 "\n", opts.nsid); + return 0; + } + + if (opts.nsid == 0) { + /* + * NSID not specified - find a free index. + * + * If no free slots are found, opts.nsid will be subsystem->max_nsid + 1, which will + * expand max_nsid if possible. + */ + for (opts.nsid = 1; opts.nsid <= subsystem->max_nsid; opts.nsid++) { + if (_spdk_nvmf_subsystem_get_ns(subsystem, opts.nsid) == NULL) { + break; + } + } + } + + if (_spdk_nvmf_subsystem_get_ns(subsystem, opts.nsid)) { + SPDK_ERRLOG("Requested NSID %" PRIu32 " already in use\n", opts.nsid); + return 0; + } + + if (opts.nsid > subsystem->max_nsid) { + struct spdk_nvmf_ns **new_ns_array; + + /* If MaxNamespaces was specified, we can't extend max_nsid beyond it. */ + if (subsystem->max_allowed_nsid > 0 && opts.nsid > subsystem->max_allowed_nsid) { + SPDK_ERRLOG("Can't extend NSID range above MaxNamespaces\n"); + return 0; + } + + /* If a controller is connected, we can't change NN. 
*/ + if (!TAILQ_EMPTY(&subsystem->ctrlrs)) { + SPDK_ERRLOG("Can't extend NSID range while controllers are connected\n"); + return 0; + } + + new_ns_array = realloc(subsystem->ns, sizeof(struct spdk_nvmf_ns *) * opts.nsid); + if (new_ns_array == NULL) { + SPDK_ERRLOG("Memory allocation error while resizing namespace array.\n"); + return 0; + } + + memset(new_ns_array + subsystem->max_nsid, 0, + sizeof(struct spdk_nvmf_ns *) * (opts.nsid - subsystem->max_nsid)); + subsystem->ns = new_ns_array; + subsystem->max_nsid = opts.nsid; + } + + ns = calloc(1, sizeof(*ns)); + if (ns == NULL) { + SPDK_ERRLOG("Namespace allocation failed\n"); + return 0; + } + + ns->bdev = bdev; + ns->opts = opts; + ns->subsystem = subsystem; + rc = spdk_bdev_open(bdev, true, spdk_nvmf_ns_hot_remove, ns, &ns->desc); + if (rc != 0) { + SPDK_ERRLOG("Subsystem %s: bdev %s cannot be opened, error=%d\n", + subsystem->subnqn, spdk_bdev_get_name(bdev), rc); + free(ns); + return 0; + } + rc = spdk_bdev_module_claim_bdev(bdev, ns->desc, &ns_bdev_module); + if (rc != 0) { + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + subsystem->ns[opts.nsid - 1] = ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Subsystem %s: bdev %s assigned nsid %" PRIu32 "\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + spdk_bdev_get_name(bdev), + opts.nsid); + + spdk_nvmf_subsystem_ns_changed(subsystem, opts.nsid); + + return opts.nsid; +} + +static uint32_t +spdk_nvmf_subsystem_get_next_allocated_nsid(struct spdk_nvmf_subsystem *subsystem, + uint32_t prev_nsid) +{ + uint32_t nsid; + + if (prev_nsid >= subsystem->max_nsid) { + return 0; + } + + for (nsid = prev_nsid + 1; nsid <= subsystem->max_nsid; nsid++) { + if (subsystem->ns[nsid - 1]) { + return nsid; + } + } + + return 0; +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_first_ns(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t first_nsid; + + first_nsid = spdk_nvmf_subsystem_get_next_allocated_nsid(subsystem, 0); + return _spdk_nvmf_subsystem_get_ns(subsystem, first_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_next_ns(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *prev_ns) +{ + uint32_t next_nsid; + + next_nsid = spdk_nvmf_subsystem_get_next_allocated_nsid(subsystem, prev_ns->opts.nsid); + return _spdk_nvmf_subsystem_get_ns(subsystem, next_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + return _spdk_nvmf_subsystem_get_ns(subsystem, nsid); +} + +uint32_t +spdk_nvmf_ns_get_id(const struct spdk_nvmf_ns *ns) +{ + return ns->opts.nsid; +} + +struct spdk_bdev * +spdk_nvmf_ns_get_bdev(struct spdk_nvmf_ns *ns) +{ + return ns->bdev; +} + +void +spdk_nvmf_ns_get_opts(const struct spdk_nvmf_ns *ns, struct spdk_nvmf_ns_opts *opts, + size_t opts_size) +{ + memset(opts, 0, opts_size); + memcpy(opts, &ns->opts, spdk_min(sizeof(ns->opts), opts_size)); +} + +const char * +spdk_nvmf_subsystem_get_sn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->sn; +} + +int +spdk_nvmf_subsystem_set_sn(struct spdk_nvmf_subsystem *subsystem, const char *sn) +{ + size_t len, max_len; + + max_len = sizeof(subsystem->sn) - 1; + len = strlen(sn); + if (len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid sn \"%s\": length %zu > max %zu\n", + sn, len, max_len); + return -1; + } + + if (!spdk_nvmf_valid_ascii_string(sn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII sn\n"); + SPDK_TRACEDUMP(SPDK_LOG_NVMF, "sn", sn, len); + return -1; + } + + snprintf(subsystem->sn, sizeof(subsystem->sn), "%s", sn); + + 
return 0; +} + +const char * +spdk_nvmf_subsystem_get_nqn(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subnqn; +} + +/* Workaround for astyle formatting bug */ +typedef enum spdk_nvmf_subtype nvmf_subtype_t; + +nvmf_subtype_t +spdk_nvmf_subsystem_get_type(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subtype; +} + +static uint16_t +spdk_nvmf_subsystem_gen_cntlid(struct spdk_nvmf_subsystem *subsystem) +{ + int count; + + /* + * In the worst case, we might have to try all CNTLID values between 1 and 0xFFF0 - 1 + * before we find one that is unused (or find that all values are in use). + */ + for (count = 0; count < 0xFFF0 - 1; count++) { + subsystem->next_cntlid++; + if (subsystem->next_cntlid >= 0xFFF0) { + /* The spec reserves cntlid values in the range FFF0h to FFFFh. */ + subsystem->next_cntlid = 1; + } + + /* Check if a controller with this cntlid currently exists. */ + if (spdk_nvmf_subsystem_get_ctrlr(subsystem, subsystem->next_cntlid) == NULL) { + /* Found unused cntlid */ + return subsystem->next_cntlid; + } + } + + /* All valid cntlid values are in use. */ + return 0xFFFF; +} + +int +spdk_nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_ctrlr *ctrlr) +{ + ctrlr->cntlid = spdk_nvmf_subsystem_gen_cntlid(subsystem); + if (ctrlr->cntlid == 0xFFFF) { + /* Unable to get a cntlid */ + SPDK_ERRLOG("Reached max simultaneous ctrlrs\n"); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&subsystem->ctrlrs, ctrlr, link); + + return 0; +} + +void +spdk_nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr) +{ + assert(subsystem == ctrlr->subsys); + TAILQ_REMOVE(&subsystem->ctrlrs, ctrlr, link); +} + +struct spdk_nvmf_ctrlr * +spdk_nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->cntlid == cntlid) { + return ctrlr; + } + } + + return NULL; +} + +uint32_t +spdk_nvmf_subsystem_get_max_namespaces(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_allowed_nsid; +} diff --git a/src/spdk/lib/nvmf/transport.c b/src/spdk/lib/nvmf/transport.c new file mode 100644 index 00000000..af4660c9 --- /dev/null +++ b/src/spdk/lib/nvmf/transport.c @@ -0,0 +1,236 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/log.h" +#include "spdk/nvmf.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +static const struct spdk_nvmf_transport_ops *const g_transport_ops[] = { +#ifdef SPDK_CONFIG_RDMA + &spdk_nvmf_transport_rdma, +#endif +}; + +#define NUM_TRANSPORTS (SPDK_COUNTOF(g_transport_ops)) + +static inline const struct spdk_nvmf_transport_ops * +spdk_nvmf_get_transport_ops(enum spdk_nvme_transport_type type) +{ + size_t i; + for (i = 0; i != NUM_TRANSPORTS; i++) { + if (g_transport_ops[i]->type == type) { + return g_transport_ops[i]; + } + } + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_create(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops = NULL; + struct spdk_nvmf_transport *transport; + + if ((opts->max_io_size % opts->io_unit_size != 0) || + (opts->max_io_size / opts->io_unit_size > + SPDK_NVMF_MAX_SGL_ENTRIES)) { + SPDK_ERRLOG("%s: invalid IO size, MaxIO:%d, UnitIO:%d, MaxSGL:%d\n", + spdk_nvme_transport_id_trtype_str(type), + opts->max_io_size, + opts->io_unit_size, + SPDK_NVMF_MAX_SGL_ENTRIES); + return NULL; + } + + ops = spdk_nvmf_get_transport_ops(type); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", + spdk_nvme_transport_id_trtype_str(type)); + return NULL; + } + + transport = ops->create(opts); + if (!transport) { + SPDK_ERRLOG("Unable to create new transport of type %s\n", + spdk_nvme_transport_id_trtype_str(type)); + return NULL; + } + + transport->ops = ops; + transport->opts = *opts; + + return transport; +} + +int +spdk_nvmf_transport_destroy(struct spdk_nvmf_transport *transport) +{ + return transport->ops->destroy(transport); +} + +int +spdk_nvmf_transport_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return transport->ops->listen(transport, trid); +} + +int +spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return transport->ops->stop_listen(transport, trid); +} + +void +spdk_nvmf_transport_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + transport->ops->accept(transport, cb_fn); +} + +void +spdk_nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + transport->ops->listener_discover(transport, trid, entry); +} + +struct spdk_nvmf_transport_poll_group * +spdk_nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *group; + + group = transport->ops->poll_group_create(transport); + group->transport = transport; + + return group; +} + +void +spdk_nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + group->transport->ops->poll_group_destroy(group); +} + +int 
+spdk_nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + if (qpair->transport) { + assert(qpair->transport == group->transport); + if (qpair->transport != group->transport) { + return -1; + } + } else { + qpair->transport = group->transport; + } + + return group->transport->ops->poll_group_add(group, qpair); +} + +int +spdk_nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + return group->transport->ops->poll_group_poll(group); +} + +int +spdk_nvmf_transport_req_free(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_free(req); +} + +int +spdk_nvmf_transport_req_complete(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_complete(req); +} + +void +spdk_nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair) +{ + qpair->transport->ops->qpair_fini(qpair); +} + +bool +spdk_nvmf_transport_qpair_is_idle(struct spdk_nvmf_qpair *qpair) +{ + return qpair->transport->ops->qpair_is_idle(qpair); +} + +int +spdk_nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_listen_trid(qpair, trid); +} + +bool +spdk_nvmf_transport_opts_init(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops; + + ops = spdk_nvmf_get_transport_ops(type); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", + spdk_nvme_transport_id_trtype_str(type)); + return false; + } + + ops->opts_init(opts); + return true; +} diff --git a/src/spdk/lib/nvmf/transport.h b/src/spdk/lib/nvmf/transport.h new file mode 100644 index 00000000..1329a80c --- /dev/null +++ b/src/spdk/lib/nvmf/transport.h @@ -0,0 +1,200 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NVMF_TRANSPORT_H +#define SPDK_NVMF_TRANSPORT_H + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" + +struct spdk_nvmf_transport { + struct spdk_nvmf_tgt *tgt; + const struct spdk_nvmf_transport_ops *ops; + struct spdk_nvmf_transport_opts opts; + + TAILQ_ENTRY(spdk_nvmf_transport) link; +}; + +struct spdk_nvmf_transport_ops { + /** + * Transport type + */ + enum spdk_nvme_transport_type type; + + /** + * Initialize transport options to default value + */ + void (*opts_init)(struct spdk_nvmf_transport_opts *opts); + + /** + * Create a transport for the given transport opts + */ + struct spdk_nvmf_transport *(*create)(struct spdk_nvmf_transport_opts *opts); + + /** + * Destroy the transport + */ + int (*destroy)(struct spdk_nvmf_transport *transport); + + /** + * Instruct the transport to accept new connections at the address + * provided. This may be called multiple times. + */ + int (*listen)(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + + /** + * Stop accepting new connections at the given address. + */ + int (*stop_listen)(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + + /** + * Check for new connections on the transport. + */ + void (*accept)(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn); + + /** + * Fill out a discovery log entry for a specific listen address. + */ + void (*listener_discover)(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + + /** + * Create a new poll group + */ + struct spdk_nvmf_transport_poll_group *(*poll_group_create)(struct spdk_nvmf_transport *transport); + + /** + * Destroy a poll group + */ + void (*poll_group_destroy)(struct spdk_nvmf_transport_poll_group *group); + + /** + * Add a qpair to a poll group + */ + int (*poll_group_add)(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + + /** + * Poll the group to process I/O + */ + int (*poll_group_poll)(struct spdk_nvmf_transport_poll_group *group); + + /* + * Free the request without sending a response + * to the originator. Release memory tied to this request. + */ + int (*req_free)(struct spdk_nvmf_request *req); + + /* + * Signal request completion, which sends a response + * to the originator. + */ + int (*req_complete)(struct spdk_nvmf_request *req); + + /* + * Deinitialize a connection. + */ + void (*qpair_fini)(struct spdk_nvmf_qpair *qpair); + + /* + * True if the qpair has no pending IO. + */ + bool (*qpair_is_idle)(struct spdk_nvmf_qpair *qpair); + + /* + * Get the peer transport ID for the queue pair. + */ + int (*qpair_get_peer_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + + /* + * Get the local transport ID for the queue pair. 
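/*
 * Example (illustrative sketch): the shape of an ops table for a
 * hypothetical transport. Every my_* symbol below is made up; a real
 * transport supplies one function per member (compare the RDMA transport
 * declared at the end of this header) and is then listed in the
 * g_transport_ops[] table in transport.c.
 */
static const struct spdk_nvmf_transport_ops my_transport_ops = {
	.type			= SPDK_NVME_TRANSPORT_RDMA,
	.opts_init		= my_opts_init,
	.create			= my_create,
	.destroy		= my_destroy,
	.listen			= my_listen,
	.stop_listen		= my_stop_listen,
	.accept			= my_accept,
	.listener_discover	= my_listener_discover,
	.poll_group_create	= my_poll_group_create,
	.poll_group_destroy	= my_poll_group_destroy,
	.poll_group_add		= my_poll_group_add,
	.poll_group_poll	= my_poll_group_poll,
	.req_free		= my_req_free,
	.req_complete		= my_req_complete,
	.qpair_fini		= my_qpair_fini,
	.qpair_is_idle		= my_qpair_is_idle,
	.qpair_get_peer_trid	= my_qpair_get_peer_trid,
	.qpair_get_local_trid	= my_qpair_get_local_trid,
	.qpair_get_listen_trid	= my_qpair_get_listen_trid,
};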
+ */ + int (*qpair_get_local_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + + /* + * Get the listener transport ID that accepted this qpair originally. + */ + int (*qpair_get_listen_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); +}; + + +int spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + +void spdk_nvmf_transport_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn); + +void spdk_nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + +struct spdk_nvmf_transport_poll_group *spdk_nvmf_transport_poll_group_create( + struct spdk_nvmf_transport *transport); + +void spdk_nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +int spdk_nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int spdk_nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); + +int spdk_nvmf_transport_req_free(struct spdk_nvmf_request *req); + +int spdk_nvmf_transport_req_complete(struct spdk_nvmf_request *req); + +void spdk_nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair); + +bool spdk_nvmf_transport_qpair_is_idle(struct spdk_nvmf_qpair *qpair); + +int spdk_nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int spdk_nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int spdk_nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +bool spdk_nvmf_transport_opts_init(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts); + +extern const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; + +#endif /* SPDK_NVMF_TRANSPORT_H */ diff --git a/src/spdk/lib/rocksdb/env_spdk.cc b/src/spdk/lib/rocksdb/env_spdk.cc new file mode 100644 index 00000000..63c979eb --- /dev/null +++ b/src/spdk/lib/rocksdb/env_spdk.cc @@ -0,0 +1,764 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "rocksdb/env.h" +#include +#include +#include + +extern "C" { +#include "spdk/env.h" +#include "spdk/event.h" +#include "spdk/blob.h" +#include "spdk/blobfs.h" +#include "spdk/blob_bdev.h" +#include "spdk/log.h" +#include "spdk/thread.h" +#include "spdk/bdev.h" +} + +namespace rocksdb +{ + +struct spdk_filesystem *g_fs = NULL; +struct spdk_bs_dev *g_bs_dev; +uint32_t g_lcore = 0; +std::string g_bdev_name; +volatile bool g_spdk_ready = false; +volatile bool g_spdk_start_failure = false; +struct sync_args { + struct spdk_io_channel *channel; +}; + +__thread struct sync_args g_sync_args; + +static void +__call_fn(void *arg1, void *arg2) +{ + fs_request_fn fn; + + fn = (fs_request_fn)arg1; + fn(arg2); +} + +static void +__send_request(fs_request_fn fn, void *arg) +{ + struct spdk_event *event; + + event = spdk_event_allocate(g_lcore, __call_fn, (void *)fn, arg); + spdk_event_call(event); +} + +static std::string +sanitize_path(const std::string &input, const std::string &mount_directory) +{ + int index = 0; + std::string name; + std::string input_tmp; + + input_tmp = input.substr(mount_directory.length(), input.length()); + for (const char &c : input_tmp) { + if (index == 0) { + if (c != '/') { + name = name.insert(index, 1, '/'); + index++; + } + name = name.insert(index, 1, c); + index++; + } else { + if (name[index - 1] == '/' && c == '/') { + continue; + } else { + name = name.insert(index, 1, c); + index++; + } + } + } + + if (name[name.size() - 1] == '/') { + name = name.erase(name.size() - 1, 1); + } + return name; +} + +class SpdkSequentialFile : public SequentialFile +{ + struct spdk_file *mFile; + uint64_t mOffset; +public: + SpdkSequentialFile(struct spdk_file *file) : mFile(file), mOffset(0) {} + virtual ~SpdkSequentialFile(); + + virtual Status Read(size_t n, Slice *result, char *scratch) override; + virtual Status Skip(uint64_t n) override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkSequentialFile::~SpdkSequentialFile(void) +{ + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkSequentialFile::Read(size_t n, Slice *result, char *scratch) +{ + int64_t ret; + + ret = spdk_file_read(mFile, g_sync_args.channel, scratch, mOffset, n); + if (ret >= 0) { + mOffset += ret; + *result = Slice(scratch, ret); + return Status::OK(); + } else { + errno = -ret; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkSequentialFile::Skip(uint64_t n) +{ + mOffset += n; + return Status::OK(); +} + +Status +SpdkSequentialFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkRandomAccessFile : public RandomAccessFile +{ + struct spdk_file *mFile; +public: + SpdkRandomAccessFile(struct spdk_file *file) : mFile(file) {} + virtual ~SpdkRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice *result, char *scratch) const override; + virtual Status 
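/*
 * Worked examples (illustrative) for sanitize_path() above, assuming the
 * BlobFS mount directory is "/spdk_mount" (the paths themselves are made up):
 *
 *   sanitize_path("/spdk_mount/db0//000001.sst", "/spdk_mount") -> "/db0/000001.sst"
 *   sanitize_path("/spdk_mount/db0/",            "/spdk_mount") -> "/db0"
 *
 * i.e. the mount prefix is stripped, a leading '/' is guaranteed, repeated
 * '/' characters are collapsed, and a trailing '/' is removed before the
 * name is handed to the SPDK BlobFS calls.
 */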
InvalidateCache(size_t offset, size_t length) override; +}; + +SpdkRandomAccessFile::~SpdkRandomAccessFile(void) +{ + spdk_file_close(mFile, g_sync_args.channel); +} + +Status +SpdkRandomAccessFile::Read(uint64_t offset, size_t n, Slice *result, char *scratch) const +{ + int64_t rc; + + rc = spdk_file_read(mFile, g_sync_args.channel, scratch, offset, n); + if (rc >= 0) { + *result = Slice(scratch, n); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +Status +SpdkRandomAccessFile::InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) +{ + return Status::OK(); +} + +class SpdkWritableFile : public WritableFile +{ + struct spdk_file *mFile; + uint64_t mSize; + +public: + SpdkWritableFile(struct spdk_file *file) : mFile(file), mSize(0) {} + ~SpdkWritableFile() + { + if (mFile != NULL) { + Close(); + } + } + + virtual void SetIOPriority(Env::IOPriority pri) + { + if (pri == Env::IO_HIGH) { + spdk_file_set_priority(mFile, SPDK_FILE_PRIORITY_HIGH); + } + } + + virtual Status Truncate(uint64_t size) override + { + int rc; + rc = spdk_file_truncate(mFile, g_sync_args.channel, size); + if (!rc) { + mSize = size; + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Close() override + { + spdk_file_close(mFile, g_sync_args.channel); + mFile = NULL; + return Status::OK(); + } + virtual Status Append(const Slice &data) override; + virtual Status Flush() override + { + return Status::OK(); + } + virtual Status Sync() override + { + int rc; + + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status Fsync() override + { + int rc; + + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual bool IsSyncThreadSafe() const override + { + return true; + } + virtual uint64_t GetFileSize() override + { + return mSize; + } + virtual Status InvalidateCache(__attribute__((unused)) size_t offset, + __attribute__((unused)) size_t length) override + { + return Status::OK(); + } + virtual Status Allocate(uint64_t offset, uint64_t len) override + { + int rc; + + rc = spdk_file_truncate(mFile, g_sync_args.channel, offset + len); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual Status RangeSync(__attribute__((unused)) uint64_t offset, + __attribute__((unused)) uint64_t nbytes) override + { + int rc; + + /* + * SPDK BlobFS does not have a range sync operation yet, so just sync + * the whole file. 
+ */ + rc = spdk_file_sync(mFile, g_sync_args.channel); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } + } + virtual size_t GetUniqueId(char *id, size_t max_size) const override + { + int rc; + + rc = spdk_file_get_id(mFile, id, max_size); + if (rc < 0) { + return 0; + } else { + return rc; + } + } +}; + +Status +SpdkWritableFile::Append(const Slice &data) +{ + int64_t rc; + + rc = spdk_file_write(mFile, g_sync_args.channel, (void *)data.data(), mSize, data.size()); + if (rc >= 0) { + mSize += data.size(); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(spdk_file_get_name(mFile), strerror(errno)); + } +} + +class SpdkDirectory : public Directory +{ +public: + SpdkDirectory() {} + ~SpdkDirectory() {} + Status Fsync() override + { + return Status::OK(); + } +}; + +class SpdkAppStartException : public std::runtime_error +{ +public: + SpdkAppStartException(std::string mess): std::runtime_error(mess) {} +}; + +class SpdkEnv : public EnvWrapper +{ +private: + pthread_t mSpdkTid; + std::string mDirectory; + std::string mConfig; + std::string mBdev; + +public: + SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb); + + virtual ~SpdkEnv(); + + virtual Status NewSequentialFile(const std::string &fname, + unique_ptr *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + struct spdk_file *file; + int rc; + + std::string name = sanitize_path(fname, mDirectory); + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkSequentialFile(file)); + return Status::OK(); + } else { + /* Myrocks engine uses errno(ENOENT) as one + * special condition, for the purpose to + * support MySQL, set the errno to right value. 
+ */ + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewSequentialFile(fname, result, options); + } + } + + virtual Status NewRandomAccessFile(const std::string &fname, + unique_ptr *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, + name.c_str(), 0, &file); + if (rc == 0) { + result->reset(new SpdkRandomAccessFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewRandomAccessFile(fname, result, options); + } + } + + virtual Status NewWritableFile(const std::string &fname, + unique_ptr *result, + const EnvOptions &options) override + { + if (fname.compare(0, mDirectory.length(), mDirectory) == 0) { + std::string name = sanitize_path(fname, mDirectory); + struct spdk_file *file; + int rc; + + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, &file); + if (rc == 0) { + result->reset(new SpdkWritableFile(file)); + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } else { + return EnvWrapper::NewWritableFile(fname, result, options); + } + } + + virtual Status ReuseWritableFile(const std::string &fname, + const std::string &old_fname, + unique_ptr *result, + const EnvOptions &options) override + { + return EnvWrapper::ReuseWritableFile(fname, old_fname, result, options); + } + + virtual Status NewDirectory(__attribute__((unused)) const std::string &name, + unique_ptr *result) override + { + result->reset(new SpdkDirectory()); + return Status::OK(); + } + virtual Status FileExists(const std::string &fname) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == 0) { + return Status::OK(); + } + return EnvWrapper::FileExists(fname); + } + virtual Status RenameFile(const std::string &src, const std::string &t) override + { + int rc; + std::string src_name = sanitize_path(src, mDirectory); + std::string target_name = sanitize_path(t, mDirectory); + + rc = spdk_fs_rename_file(g_fs, g_sync_args.channel, + src_name.c_str(), target_name.c_str()); + if (rc == -ENOENT) { + return EnvWrapper::RenameFile(src, t); + } + return Status::OK(); + } + virtual Status LinkFile(__attribute__((unused)) const std::string &src, + __attribute__((unused)) const std::string &t) override + { + return Status::NotSupported("SpdkEnv does not support LinkFile"); + } + virtual Status GetFileSize(const std::string &fname, uint64_t *size) override + { + struct spdk_file_stat stat; + int rc; + std::string name = sanitize_path(fname, mDirectory); + + rc = spdk_fs_file_stat(g_fs, g_sync_args.channel, name.c_str(), &stat); + if (rc == -ENOENT) { + return EnvWrapper::GetFileSize(fname, size); + } + *size = stat.size; + return Status::OK(); + } + virtual Status DeleteFile(const std::string &fname) override + { + int rc; + std::string name = sanitize_path(fname, mDirectory); + + rc = spdk_fs_delete_file(g_fs, g_sync_args.channel, name.c_str()); + if (rc == -ENOENT) { + return EnvWrapper::DeleteFile(fname); + } + return Status::OK(); + } + virtual void StartThread(void (*function)(void *arg), void *arg) override; + virtual Status LockFile(const std::string 
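/*
 * Example (illustrative sketch): the synchronous BlobFS call pattern the
 * Env methods above are built on. The helper name and file name are
 * hypothetical; g_fs and g_sync_args.channel are the globals initialized
 * elsewhere in this file, and errors come back as negated errno values,
 * mirroring the Status::IOError conversions above.
 */
static int
example_blobfs_write(const char *name, void *buf, uint64_t len)
{
	struct spdk_file *file;
	int rc;

	rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name,
			       SPDK_BLOBFS_OPEN_CREATE, &file);
	if (rc != 0) {
		return rc;
	}

	rc = spdk_file_write(file, g_sync_args.channel, buf, 0, len);
	if (rc >= 0) {
		rc = spdk_file_sync(file, g_sync_args.channel);
	}

	spdk_file_close(file, g_sync_args.channel);
	return rc;
}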
&fname, FileLock **lock) override + { + std::string name = sanitize_path(fname, mDirectory); + int64_t rc; + + rc = spdk_fs_open_file(g_fs, g_sync_args.channel, name.c_str(), + SPDK_BLOBFS_OPEN_CREATE, (struct spdk_file **)lock); + if (!rc) { + return Status::OK(); + } else { + errno = -rc; + return Status::IOError(name, strerror(errno)); + } + } + virtual Status UnlockFile(FileLock *lock) override + { + spdk_file_close((struct spdk_file *)lock, g_sync_args.channel); + return Status::OK(); + } + virtual Status GetChildren(const std::string &dir, + std::vector *result) override + { + std::string::size_type pos; + std::set dir_and_file_set; + std::string full_path; + std::string filename; + std::string dir_name; + + if (dir.find("archive") != std::string::npos) { + return Status::OK(); + } + if (dir.compare(0, mDirectory.length(), mDirectory) == 0) { + spdk_fs_iter iter; + struct spdk_file *file; + dir_name = sanitize_path(dir, mDirectory); + + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + full_path = spdk_file_get_name(file); + if (strncmp(dir_name.c_str(), full_path.c_str(), dir_name.length())) { + iter = spdk_fs_iter_next(iter); + continue; + } + pos = full_path.find("/", dir_name.length() + 1); + + if (pos != std::string::npos) { + filename = full_path.substr(dir_name.length() + 1, pos - dir_name.length() - 1); + } else { + filename = full_path.substr(dir_name.length() + 1); + } + dir_and_file_set.insert(filename); + iter = spdk_fs_iter_next(iter); + } + + for (auto &s : dir_and_file_set) { + result->push_back(s); + } + + result->push_back("."); + result->push_back(".."); + + return Status::OK(); + } + return EnvWrapper::GetChildren(dir, result); + } +}; + +static void +_spdk_send_msg(__attribute__((unused)) spdk_thread_fn fn, + __attribute__((unused)) void *ctx, + __attribute__((unused)) void *thread_ctx) +{ + /* Not supported */ + assert(false); +} + +void SpdkInitializeThread(void) +{ + if (g_fs != NULL) { + /* TODO: Add an event lib call to dynamically register a thread */ + spdk_allocate_thread(_spdk_send_msg, NULL, NULL, NULL, "spdk_rocksdb"); + g_sync_args.channel = spdk_fs_alloc_io_channel_sync(g_fs); + } +} + +struct SpdkThreadState { + void (*user_function)(void *); + void *arg; +}; + +static void SpdkStartThreadWrapper(void *arg) +{ + SpdkThreadState *state = reinterpret_cast(arg); + + SpdkInitializeThread(); + state->user_function(state->arg); + delete state; +} + +void SpdkEnv::StartThread(void (*function)(void *arg), void *arg) +{ + SpdkThreadState *state = new SpdkThreadState; + state->user_function = function; + state->arg = arg; + EnvWrapper::StartThread(SpdkStartThreadWrapper, state); +} + +static void +fs_load_cb(__attribute__((unused)) void *ctx, + struct spdk_filesystem *fs, int fserrno) +{ + if (fserrno == 0) { + g_fs = fs; + } + g_spdk_ready = true; +} + +static void +spdk_rocksdb_run(__attribute__((unused)) void *arg1, + __attribute__((unused)) void *arg2) +{ + struct spdk_bdev *bdev; + + bdev = spdk_bdev_get_by_name(g_bdev_name.c_str()); + + if (bdev == NULL) { + SPDK_ERRLOG("bdev %s not found\n", g_bdev_name.c_str()); + exit(1); + } + + g_lcore = spdk_env_get_first_core(); + + g_bs_dev = spdk_bdev_create_bs_dev(bdev, NULL, NULL); + printf("using bdev %s\n", g_bdev_name.c_str()); + spdk_fs_load(g_bs_dev, __send_request, fs_load_cb, NULL); +} + +static void +fs_unload_cb(__attribute__((unused)) void *ctx, + __attribute__((unused)) int fserrno) +{ + assert(fserrno == 0); + + spdk_app_stop(0); +} + +static void 
+spdk_rocksdb_shutdown(void) +{ + if (g_fs != NULL) { + spdk_fs_unload(g_fs, fs_unload_cb, NULL); + } else { + fs_unload_cb(NULL, 0); + } +} + +static void * +initialize_spdk(void *arg) +{ + struct spdk_app_opts *opts = (struct spdk_app_opts *)arg; + int rc; + + rc = spdk_app_start(opts, spdk_rocksdb_run, NULL, NULL); + /* + * TODO: Revisit for case of internal failure of + * spdk_app_start(), itself. At this time, it's known + * the only application's use of spdk_app_stop() passes + * a zero; i.e. no fail (non-zero) cases so here we + * assume there was an internal failure and flag it + * so we can throw an exception. + */ + if (rc) { + g_spdk_start_failure = true; + } else { + spdk_app_fini(); + delete opts; + } + pthread_exit(NULL); + +} + +SpdkEnv::SpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) + : EnvWrapper(base_env), mDirectory(dir), mConfig(conf), mBdev(bdev) +{ + struct spdk_app_opts *opts = new struct spdk_app_opts; + + spdk_app_opts_init(opts); + opts->name = "rocksdb"; + opts->config_file = mConfig.c_str(); + opts->mem_size = 1024 + cache_size_in_mb; + opts->shutdown_cb = spdk_rocksdb_shutdown; + + spdk_fs_set_cache_size(cache_size_in_mb); + g_bdev_name = mBdev; + + pthread_create(&mSpdkTid, NULL, &initialize_spdk, opts); + while (!g_spdk_ready && !g_spdk_start_failure) + ; + if (g_spdk_start_failure) { + delete opts; + throw SpdkAppStartException("spdk_app_start() unable to start spdk_rocksdb_run()"); + } + + SpdkInitializeThread(); +} + +SpdkEnv::~SpdkEnv() +{ + /* This is a workaround for rocksdb test, we close the files if the rocksdb not + * do the work before the test quit. + */ + if (g_fs != NULL) { + spdk_fs_iter iter; + struct spdk_file *file; + + if (!g_sync_args.channel) { + SpdkInitializeThread(); + } + iter = spdk_fs_iter_first(g_fs); + while (iter != NULL) { + file = spdk_fs_iter_get_file(iter); + spdk_file_close(file, g_sync_args.channel); + iter = spdk_fs_iter_next(iter); + } + } + + spdk_app_start_shutdown(); + pthread_join(mSpdkTid, NULL); +} + +Env *NewSpdkEnv(Env *base_env, const std::string &dir, const std::string &conf, + const std::string &bdev, uint64_t cache_size_in_mb) +{ + try { + SpdkEnv *spdk_env = new SpdkEnv(base_env, dir, conf, bdev, cache_size_in_mb); + if (g_fs != NULL) { + return spdk_env; + } else { + delete spdk_env; + return NULL; + } + } catch (SpdkAppStartException &e) { + SPDK_ERRLOG("NewSpdkEnv: exception caught: %s", e.what()); + return NULL; + } catch (...) { + SPDK_ERRLOG("NewSpdkEnv: default exception caught"); + return NULL; + } +} + +} // namespace rocksdb diff --git a/src/spdk/lib/rocksdb/spdk.rocksdb.mk b/src/spdk/lib/rocksdb/spdk.rocksdb.mk new file mode 100644 index 00000000..2f7a4a86 --- /dev/null +++ b/src/spdk/lib/rocksdb/spdk.rocksdb.mk @@ -0,0 +1,70 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. 
+# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# This snippet will be included into the RocksDB Makefile + +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk +include $(SPDK_ROOT_DIR)/mk/spdk.app.mk +include $(SPDK_ROOT_DIR)/mk/spdk.modules.mk + +CXXFLAGS += -I$(SPDK_DIR)/include -Iinclude/ + +# The SPDK makefiles turn this on, but RocksDB won't compile with it. So +# turn it off after including the SPDK makefiles. +CXXFLAGS += -Wno-missing-declarations + +# The SPDK Makefiles may turn these options on but we do not want to enable +# them for the RocksDB source files. +CXXFLAGS += -fno-profile-arcs -fno-test-coverage +ifeq ($(CONFIG_UBSAN),y) +CXXFLAGS += -fno-sanitize=undefined +endif +ifeq ($(CONFIG_ASAN),y) +CXXFLAGS += -fno-sanitize=address +endif + +SPDK_LIB_LIST = event_bdev event_copy +SPDK_LIB_LIST += blobfs bdev copy event util conf trace \ + log jsonrpc json rpc thread + +AM_LINK += $(COPY_MODULES_LINKER_ARGS) $(BLOCKDEV_MODULES_LINKER_ARGS) +AM_LINK += $(SPDK_LIB_LINKER_ARGS) $(ENV_LINKER_ARGS) +AM_LINK += $(SYS_LIBS) + +ifeq ($(CONFIG_UBSAN),y) +AM_LINK += -fsanitize=undefined +endif + +ifeq ($(CONFIG_COVERAGE),y) +AM_LINK += -fprofile-arcs -ftest-coverage +endif diff --git a/src/spdk/lib/rpc/Makefile b/src/spdk/lib/rpc/Makefile new file mode 100644 index 00000000..024d7a04 --- /dev/null +++ b/src/spdk/lib/rpc/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = rpc.c +LIBNAME = rpc + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/rpc/rpc.c b/src/spdk/lib/rpc/rpc.c new file mode 100644 index 00000000..985d40f4 --- /dev/null +++ b/src/spdk/lib/rpc/rpc.c @@ -0,0 +1,285 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include + +#include "spdk/stdinc.h" + +#include "spdk/queue.h" +#include "spdk/rpc.h" +#include "spdk/env.h" +#include "spdk/log.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#define RPC_DEFAULT_PORT "5260" + +static struct sockaddr_un g_rpc_listen_addr_unix = {}; +static char g_rpc_lock_path[sizeof(g_rpc_listen_addr_unix.sun_path) + sizeof(".lock")]; +static int g_rpc_lock_fd = -1; + +static struct spdk_jsonrpc_server *g_jsonrpc_server = NULL; +static uint32_t g_rpc_state; + +struct spdk_rpc_method { + const char *name; + spdk_rpc_method_handler func; + SLIST_ENTRY(spdk_rpc_method) slist; + uint32_t state_mask; +}; + +static SLIST_HEAD(, spdk_rpc_method) g_rpc_methods = SLIST_HEAD_INITIALIZER(g_rpc_methods); + +void +spdk_rpc_set_state(uint32_t state) +{ + g_rpc_state = state; +} + +static void +spdk_jsonrpc_handler(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *method, + const struct spdk_json_val *params) +{ + struct spdk_rpc_method *m; + + assert(method != NULL); + + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (spdk_json_strequal(method, m->name)) { + if ((m->state_mask & g_rpc_state) == g_rpc_state) { + m->func(request, params); + } else { + spdk_jsonrpc_send_error_response_fmt(request, SPDK_JSONRPC_ERROR_INVALID_STATE, + "Method is allowed in any state in the mask (%"PRIx32")," + " but current state is (%"PRIx32")", + m->state_mask, g_rpc_state); + } + return; + } + } + + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_METHOD_NOT_FOUND, "Method not found"); +} + +int +spdk_rpc_listen(const char *listen_addr) +{ + struct addrinfo hints; + struct addrinfo *res; + + memset(&g_rpc_listen_addr_unix, 0, sizeof(g_rpc_listen_addr_unix)); + + if (listen_addr[0] == '/') { + int rc; + + g_rpc_listen_addr_unix.sun_family = AF_UNIX; + rc = snprintf(g_rpc_listen_addr_unix.sun_path, + sizeof(g_rpc_listen_addr_unix.sun_path), + "%s", listen_addr); + if (rc < 0 || (size_t)rc >= sizeof(g_rpc_listen_addr_unix.sun_path)) { + SPDK_ERRLOG("RPC Listen address Unix socket path too long\n"); + g_rpc_listen_addr_unix.sun_path[0] = '\0'; + return -1; + } + + snprintf(g_rpc_lock_path, sizeof(g_rpc_lock_path), "%s.lock", + g_rpc_listen_addr_unix.sun_path); + + g_rpc_lock_fd = open(g_rpc_lock_path, O_RDONLY | O_CREAT, 0600); + if (g_rpc_lock_fd == -1) { + SPDK_ERRLOG("Cannot open lock file %s: %s\n", + g_rpc_lock_path, spdk_strerror(errno)); + return -1; + } + + rc = flock(g_rpc_lock_fd, LOCK_EX | LOCK_NB); + if (rc != 0) { + SPDK_ERRLOG("RPC Unix domain socket path %s in use. Specify another.\n", + g_rpc_listen_addr_unix.sun_path); + return -1; + } + + /* + * Since we acquired the lock, it is safe to delete the Unix socket file + * if it still exists from a previous process. 
+ */ + unlink(g_rpc_listen_addr_unix.sun_path); + + g_jsonrpc_server = spdk_jsonrpc_server_listen(AF_UNIX, 0, + (struct sockaddr *)&g_rpc_listen_addr_unix, + sizeof(g_rpc_listen_addr_unix), + spdk_jsonrpc_handler); + if (g_jsonrpc_server == NULL) { + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + } + } else { + char *tmp; + char *host, *port; + + tmp = strdup(listen_addr); + if (!tmp) { + SPDK_ERRLOG("Out of memory\n"); + return -1; + } + + if (spdk_parse_ip_addr(tmp, &host, &port) < 0) { + free(tmp); + SPDK_ERRLOG("Invalid listen address '%s'\n", listen_addr); + return -1; + } + + if (port == NULL) { + port = RPC_DEFAULT_PORT; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = IPPROTO_TCP; + + if (getaddrinfo(host, port, &hints, &res) != 0) { + free(tmp); + SPDK_ERRLOG("Unable to look up RPC listen address '%s'\n", listen_addr); + return -1; + } + + g_jsonrpc_server = spdk_jsonrpc_server_listen(res->ai_family, res->ai_protocol, + res->ai_addr, res->ai_addrlen, + spdk_jsonrpc_handler); + + freeaddrinfo(res); + free(tmp); + } + + if (g_jsonrpc_server == NULL) { + SPDK_ERRLOG("spdk_jsonrpc_server_listen() failed\n"); + return -1; + } + + return 0; +} + +void +spdk_rpc_accept(void) +{ + spdk_jsonrpc_server_poll(g_jsonrpc_server); +} + +void +spdk_rpc_register_method(const char *method, spdk_rpc_method_handler func, uint32_t state_mask) +{ + struct spdk_rpc_method *m; + + m = calloc(1, sizeof(struct spdk_rpc_method)); + assert(m != NULL); + + m->name = strdup(method); + assert(m->name != NULL); + + m->func = func; + m->state_mask = state_mask; + + /* TODO: use a hash table or sorted list */ + SLIST_INSERT_HEAD(&g_rpc_methods, m, slist); +} + +void +spdk_rpc_close(void) +{ + if (g_jsonrpc_server) { + if (g_rpc_listen_addr_unix.sun_path[0]) { + /* Delete the Unix socket file */ + unlink(g_rpc_listen_addr_unix.sun_path); + } + + spdk_jsonrpc_server_shutdown(g_jsonrpc_server); + g_jsonrpc_server = NULL; + + if (g_rpc_lock_fd != -1) { + close(g_rpc_lock_fd); + g_rpc_lock_fd = -1; + } + + if (g_rpc_lock_path[0]) { + unlink(g_rpc_lock_path); + g_rpc_lock_path[0] = '\0'; + } + } +} + +struct rpc_get_rpc_methods { + bool current; +}; + +static const struct spdk_json_object_decoder rpc_get_rpc_methods_decoders[] = { + {"current", offsetof(struct rpc_get_rpc_methods, current), spdk_json_decode_bool, true}, +}; + +static void +spdk_rpc_get_rpc_methods(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_rpc_methods req = {}; + struct spdk_json_write_ctx *w; + struct spdk_rpc_method *m; + + if (params != NULL) { + if (spdk_json_decode_object(params, rpc_get_rpc_methods_decoders, + SPDK_COUNTOF(rpc_get_rpc_methods_decoders), &req)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + SLIST_FOREACH(m, &g_rpc_methods, slist) { + if (req.current && ((m->state_mask & g_rpc_state) != g_rpc_state)) { + continue; + } + spdk_json_write_string(w, m->name); + } + spdk_json_write_array_end(w); + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_rpc_methods", spdk_rpc_get_rpc_methods, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/scsi/Makefile b/src/spdk/lib/scsi/Makefile new file 
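/*
 * Example (illustrative sketch): registering an additional RPC method. The
 * method name and reply payload are hypothetical; the pattern (validate
 * params, begin/end a JSON result, register with a state mask) mirrors
 * get_rpc_methods above. Because dispatch requires
 * (state_mask & g_rpc_state) == g_rpc_state, a method registered with
 * SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME is callable in either state.
 */
static void
spdk_rpc_example_ping(struct spdk_jsonrpc_request *request,
		      const struct spdk_json_val *params)
{
	struct spdk_json_write_ctx *w;

	if (params != NULL) {
		spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS,
						 "example_ping requires no parameters");
		return;
	}

	w = spdk_jsonrpc_begin_result(request);
	if (w == NULL) {
		return;
	}

	spdk_json_write_string(w, "pong");
	spdk_jsonrpc_end_result(request, w);
}
SPDK_RPC_REGISTER("example_ping", spdk_rpc_example_ping, SPDK_RPC_STARTUP | SPDK_RPC_RUNTIME)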
mode 100644 index 00000000..67cb445a --- /dev/null +++ b/src/spdk/lib/scsi/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = dev.c lun.c port.c scsi.c scsi_bdev.c scsi_rpc.c task.c +LIBNAME = scsi + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/scsi/dev.c b/src/spdk/lib/scsi/dev.c new file mode 100644 index 00000000..335ffacb --- /dev/null +++ b/src/spdk/lib/scsi/dev.c @@ -0,0 +1,415 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +static struct spdk_scsi_dev g_devs[SPDK_SCSI_MAX_DEVS]; + +struct spdk_scsi_dev * +spdk_scsi_dev_get_list(void) +{ + return g_devs; +} + +static struct spdk_scsi_dev * +allocate_dev(void) +{ + struct spdk_scsi_dev *dev; + int i; + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + dev = &g_devs[i]; + if (!dev->is_allocated) { + memset(dev, 0, sizeof(*dev)); + dev->id = i; + dev->is_allocated = 1; + return dev; + } + } + + return NULL; +} + +static void +free_dev(struct spdk_scsi_dev *dev) +{ + assert(dev->is_allocated == 1); + assert(dev->removed == true); + + dev->is_allocated = 0; +} + +void +spdk_scsi_dev_destruct(struct spdk_scsi_dev *dev) +{ + int lun_cnt; + int i; + + if (dev == NULL || dev->removed) { + return; + } + + dev->removed = true; + lun_cnt = 0; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + /* + * LUN will remove itself from this dev when all outstanding IO + * is done. When no more LUNs, dev will be deleted. + */ + spdk_scsi_lun_destruct(dev->lun[i]); + lun_cnt++; + } + + if (lun_cnt == 0) { + free_dev(dev); + return; + } +} + +static int +spdk_scsi_dev_find_lowest_free_lun_id(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + return i; + } + } + + return -1; +} + +int +spdk_scsi_dev_add_lun(struct spdk_scsi_dev *dev, const char *bdev_name, int lun_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_bdev *bdev; + struct spdk_scsi_lun *lun; + + bdev = spdk_bdev_get_by_name(bdev_name); + if (bdev == NULL) { + SPDK_ERRLOG("device %s: cannot find bdev '%s' (target %d)\n", + dev->name, bdev_name, lun_id); + return -1; + } + + /* Search the lowest free LUN ID if LUN ID is default */ + if (lun_id == -1) { + lun_id = spdk_scsi_dev_find_lowest_free_lun_id(dev); + if (lun_id == -1) { + SPDK_ERRLOG("Free LUN ID is not found\n"); + return -1; + } + } + + lun = spdk_scsi_lun_construct(bdev, hotremove_cb, hotremove_ctx); + if (lun == NULL) { + return -1; + } + + lun->id = lun_id; + lun->dev = dev; + dev->lun[lun_id] = lun; + return 0; +} + +void +spdk_scsi_dev_delete_lun(struct spdk_scsi_dev *dev, + struct spdk_scsi_lun *lun) +{ + int lun_cnt = 0; + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == lun) { + dev->lun[i] = NULL; + } + + if (dev->lun[i]) { + lun_cnt++; + } + } + + if (dev->removed == true && lun_cnt == 0) { + free_dev(dev); + } +} + +/* This typedef exists to work around an astyle 2.05 bug. + * Remove it when astyle is fixed. 
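/*
 * Example (illustrative): attaching a bdev to a device as a LUN. The bdev
 * name is hypothetical; passing lun_id == -1 lets spdk_scsi_dev_add_lun()
 * above pick the lowest free LUN ID, and the NULL arguments opt out of a
 * hot-remove callback.
 */
static int
example_add_lun(struct spdk_scsi_dev *dev)
{
	return spdk_scsi_dev_add_lun(dev, "Malloc0", -1, NULL, NULL);
}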
+ */ +typedef struct spdk_scsi_dev _spdk_scsi_dev; + +_spdk_scsi_dev * +spdk_scsi_dev_construct(const char *name, const char *bdev_name_list[], + int *lun_id_list, int num_luns, uint8_t protocol_id, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_dev *dev; + size_t name_len; + bool found_lun_0; + int i, rc; + + name_len = strlen(name); + if (name_len > sizeof(dev->name) - 1) { + SPDK_ERRLOG("device %s: name longer than maximum allowed length %zu\n", + name, sizeof(dev->name) - 1); + return NULL; + } + + if (num_luns == 0) { + SPDK_ERRLOG("device %s: no LUNs specified\n", name); + return NULL; + } + + found_lun_0 = false; + for (i = 0; i < num_luns; i++) { + if (lun_id_list[i] == 0) { + found_lun_0 = true; + break; + } + } + + if (!found_lun_0) { + SPDK_ERRLOG("device %s: no LUN 0 specified\n", name); + return NULL; + } + + for (i = 0; i < num_luns; i++) { + if (bdev_name_list[i] == NULL) { + SPDK_ERRLOG("NULL spdk_scsi_lun for LUN %d\n", + lun_id_list[i]); + return NULL; + } + } + + dev = allocate_dev(); + if (dev == NULL) { + return NULL; + } + + memcpy(dev->name, name, name_len + 1); + + dev->num_ports = 0; + dev->protocol_id = protocol_id; + + for (i = 0; i < num_luns; i++) { + rc = spdk_scsi_dev_add_lun(dev, bdev_name_list[i], lun_id_list[i], + hotremove_cb, hotremove_ctx); + if (rc < 0) { + spdk_scsi_dev_destruct(dev); + return NULL; + } + } + + return dev; +} + +void +spdk_scsi_dev_queue_mgmt_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task, + enum spdk_scsi_task_func func) +{ + assert(task != NULL); + + task->function = func; + spdk_scsi_lun_task_mgmt_execute(task, func); +} + +void +spdk_scsi_dev_queue_task(struct spdk_scsi_dev *dev, + struct spdk_scsi_task *task) +{ + assert(task != NULL); + + spdk_scsi_lun_execute_task(task->lun, task); +} + +static struct spdk_scsi_port * +spdk_scsi_dev_find_free_port(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + return &dev->port[i]; + } + } + + return NULL; +} + +int +spdk_scsi_dev_add_port(struct spdk_scsi_dev *dev, uint64_t id, const char *name) +{ + struct spdk_scsi_port *port; + int rc; + + if (dev->num_ports == SPDK_SCSI_DEV_MAX_PORTS) { + SPDK_ERRLOG("device already has %d ports\n", SPDK_SCSI_DEV_MAX_PORTS); + return -1; + } + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port != NULL) { + SPDK_ERRLOG("device already has port(%" PRIu64 ")\n", id); + return -1; + } + + port = spdk_scsi_dev_find_free_port(dev); + if (port == NULL) { + assert(false); + return -1; + } + + rc = spdk_scsi_port_construct(port, id, dev->num_ports, name); + if (rc != 0) { + return rc; + } + + dev->num_ports++; + return 0; +} + +int +spdk_scsi_dev_delete_port(struct spdk_scsi_dev *dev, uint64_t id) +{ + struct spdk_scsi_port *port; + + port = spdk_scsi_dev_find_port_by_id(dev, id); + if (port == NULL) { + SPDK_ERRLOG("device does not have specified port(%" PRIu64 ")\n", id); + return -1; + } + + spdk_scsi_port_destruct(port); + + dev->num_ports--; + + return 0; +} + +struct spdk_scsi_port * +spdk_scsi_dev_find_port_by_id(struct spdk_scsi_dev *dev, uint64_t id) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + if (!dev->port[i].is_used) { + continue; + } + if (dev->port[i].id == id) { + return &dev->port[i]; + } + } + + /* No matching port found. 
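/*
 * Example (illustrative sketch): constructing a device with two LUNs and
 * one port. The bdev names, device name, port id/name and the protocol
 * identifier constant (assumed to come from spdk/scsi_spec.h) are
 * illustrative; LUN 0 must always be present.
 */
static struct spdk_scsi_dev *
example_construct_dev(void)
{
	const char *bdev_names[] = { "Malloc0", "Malloc1" };
	int lun_ids[] = { 0, 1 };
	struct spdk_scsi_dev *dev;

	dev = spdk_scsi_dev_construct("Device1", bdev_names, lun_ids, 2,
				      SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI,
				      NULL, NULL);
	if (dev == NULL) {
		return NULL;
	}

	if (spdk_scsi_dev_add_port(dev, 1, "Port0") != 0) {
		spdk_scsi_dev_destruct(dev);
		return NULL;
	}

	return dev;
}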
*/ + return NULL; +} + +void +spdk_scsi_dev_free_io_channels(struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + _spdk_scsi_lun_free_io_channel(dev->lun[i]); + } +} + +int +spdk_scsi_dev_allocate_io_channels(struct spdk_scsi_dev *dev) +{ + int i, rc; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + rc = _spdk_scsi_lun_allocate_io_channel(dev->lun[i]); + if (rc < 0) { + spdk_scsi_dev_free_io_channels(dev); + return -1; + } + } + + return 0; +} + +const char * +spdk_scsi_dev_get_name(const struct spdk_scsi_dev *dev) +{ + return dev->name; +} + +int +spdk_scsi_dev_get_id(const struct spdk_scsi_dev *dev) +{ + return dev->id; +} + +struct spdk_scsi_lun * +spdk_scsi_dev_get_lun(struct spdk_scsi_dev *dev, int lun_id) +{ + if (lun_id < 0 || lun_id >= SPDK_SCSI_DEV_MAX_LUN) { + return NULL; + } + + return dev->lun[lun_id]; +} + +bool +spdk_scsi_dev_has_pending_tasks(const struct spdk_scsi_dev *dev) +{ + int i; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; ++i) { + if (dev->lun[i] && spdk_scsi_lun_has_pending_tasks(dev->lun[i])) { + return true; + } + } + + return false; +} diff --git a/src/spdk/lib/scsi/lun.c b/src/spdk/lib/scsi/lun.c new file mode 100644 index 00000000..ea44d86e --- /dev/null +++ b/src/spdk/lib/scsi/lun.c @@ -0,0 +1,452 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/event.h" +#include "spdk/util.h" + +void +spdk_scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (lun) { + TAILQ_REMOVE(&lun->tasks, task, scsi_link); + spdk_trace_record(TRACE_SCSI_TASK_DONE, lun->dev->id, 0, (uintptr_t)task, 0); + } + task->cpl_fn(task); +} + +void +spdk_scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + if (task->function == SPDK_SCSI_TASK_FUNC_LUN_RESET && + task->status == SPDK_SCSI_STATUS_GOOD) { + /* + * The backend LUN device was just reset. If there are active tasks + * in the backend, it means that LUN reset fails, and we set failure + * status to LUN reset task. + */ + if (spdk_scsi_lun_has_pending_tasks(lun)) { + SPDK_ERRLOG("lun->tasks should be empty after reset\n"); + task->response = SPDK_SCSI_TASK_MGMT_RESP_TARGET_FAILURE; + } + } + task->cpl_fn(task); +} + +int +spdk_scsi_lun_task_mgmt_execute(struct spdk_scsi_task *task, + enum spdk_scsi_task_func func) +{ + if (!task) { + return -1; + } + + if (!task->lun) { + /* LUN does not exist */ + task->response = SPDK_SCSI_TASK_MGMT_RESP_INVALID_LUN; + task->cpl_fn(task); + return -1; + } + + switch (func) { + case SPDK_SCSI_TASK_FUNC_ABORT_TASK: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_ABORT_TASK_SET: + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + SPDK_ERRLOG("ABORT_TASK_SET failed\n"); + break; + + case SPDK_SCSI_TASK_FUNC_LUN_RESET: + spdk_bdev_scsi_reset(task); + return 0; + + default: + SPDK_ERRLOG("Unknown Task Management Function!\n"); + /* + * Task management functions other than those above should never + * reach this point having been filtered by the frontend. Reject + * the task as being unsupported. + */ + task->response = SPDK_SCSI_TASK_MGMT_RESP_REJECT_FUNC_NOT_SUPPORTED; + break; + } + + spdk_scsi_lun_complete_mgmt_task(task->lun, task); + + return -1; +} + +void +spdk_scsi_task_process_null_lun(struct spdk_scsi_task *task) +{ + uint8_t buffer[36]; + uint32_t allocation_len; + uint32_t data_len; + + task->length = task->transfer_len; + if (task->cdb[0] == SPDK_SPC_INQUIRY) { + /* + * SPC-4 states that INQUIRY commands to an unsupported LUN + * must be served with PERIPHERAL QUALIFIER = 0x3 and + * PERIPHERAL DEVICE TYPE = 0x1F. 
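/*
 * Worked example (illustrative): with PERIPHERAL QUALIFIER = 0x3 and
 * PERIPHERAL DEVICE TYPE = 0x1F, the first byte of the INQUIRY data built
 * below is (0x03 << 5) | 0x1F = 0x7F.
 */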
+ */ + data_len = sizeof(buffer); + + memset(buffer, 0, data_len); + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + buffer[0] = 0x03 << 5 | 0x1f; + /* ADDITIONAL LENGTH */ + buffer[4] = data_len - 5; + + allocation_len = from_be16(&task->cdb[3]); + if (spdk_scsi_task_scatter_data(task, buffer, spdk_min(allocation_len, data_len)) >= 0) { + task->data_transferred = data_len; + task->status = SPDK_SCSI_STATUS_GOOD; + } + } else { + /* LOGICAL UNIT NOT SUPPORTED */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_LOGICAL_UNIT_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + task->data_transferred = 0; + } +} + +void +spdk_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task) +{ + int rc; + + task->status = SPDK_SCSI_STATUS_GOOD; + spdk_trace_record(TRACE_SCSI_TASK_START, lun->dev->id, task->length, (uintptr_t)task, 0); + TAILQ_INSERT_TAIL(&lun->tasks, task, scsi_link); + if (!lun->removed) { + rc = spdk_bdev_scsi_execute(task); + } else { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ABORTED_COMMAND, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + rc = SPDK_SCSI_TASK_COMPLETE; + } + + switch (rc) { + case SPDK_SCSI_TASK_PENDING: + break; + + case SPDK_SCSI_TASK_COMPLETE: + spdk_scsi_lun_complete_task(lun, task); + break; + + default: + abort(); + } +} + +static void +spdk_scsi_lun_remove(struct spdk_scsi_lun *lun) +{ + spdk_bdev_close(lun->bdev_desc); + + spdk_scsi_dev_delete_lun(lun->dev, lun); + free(lun); +} + +static int +spdk_scsi_lun_check_io_channel(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (lun->io_channel) { + return -1; + } + spdk_poller_unregister(&lun->hotremove_poller); + + spdk_scsi_lun_remove(lun); + return -1; +} + +static void +spdk_scsi_lun_notify_hot_remove(struct spdk_scsi_lun *lun) +{ + struct spdk_scsi_desc *desc, *tmp; + + if (lun->hotremove_cb) { + lun->hotremove_cb(lun, lun->hotremove_ctx); + } + + TAILQ_FOREACH_SAFE(desc, &lun->open_descs, link, tmp) { + if (desc->hotremove_cb) { + desc->hotremove_cb(lun, desc->hotremove_ctx); + } else { + spdk_scsi_lun_close(desc); + } + } + + if (lun->io_channel) { + lun->hotremove_poller = spdk_poller_register(spdk_scsi_lun_check_io_channel, + lun, 10); + } else { + spdk_scsi_lun_remove(lun); + } +} + +static int +spdk_scsi_lun_check_pending_tasks(void *arg) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)arg; + + if (spdk_scsi_lun_has_pending_tasks(lun)) { + return -1; + } + spdk_poller_unregister(&lun->hotremove_poller); + + spdk_scsi_lun_notify_hot_remove(lun); + return -1; +} + +static void +_spdk_scsi_lun_hot_remove(void *arg1) +{ + struct spdk_scsi_lun *lun = arg1; + + if (spdk_scsi_lun_has_pending_tasks(lun)) { + lun->hotremove_poller = spdk_poller_register(spdk_scsi_lun_check_pending_tasks, + lun, 10); + } else { + spdk_scsi_lun_notify_hot_remove(lun); + } +} + +static void +spdk_scsi_lun_hot_remove(void *remove_ctx) +{ + struct spdk_scsi_lun *lun = (struct spdk_scsi_lun *)remove_ctx; + struct spdk_thread *thread; + + if (lun->removed) { + return; + } + + lun->removed = true; + if (lun->io_channel == NULL) { + _spdk_scsi_lun_hot_remove(lun); + return; + } + + thread = spdk_io_channel_get_thread(lun->io_channel); + if (thread != spdk_get_thread()) { + spdk_thread_send_msg(thread, _spdk_scsi_lun_hot_remove, lun); + } else { + _spdk_scsi_lun_hot_remove(lun); + } +} + +/** + * \brief Constructs a 
new spdk_scsi_lun object based on the provided parameters. + * + * \param bdev bdev associated with this LUN + * + * \return NULL if bdev == NULL + * \return pointer to the new spdk_scsi_lun object otherwise + */ +_spdk_scsi_lun * +spdk_scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx) +{ + struct spdk_scsi_lun *lun; + int rc; + + if (bdev == NULL) { + SPDK_ERRLOG("bdev must be non-NULL\n"); + return NULL; + } + + lun = calloc(1, sizeof(*lun)); + if (lun == NULL) { + SPDK_ERRLOG("could not allocate lun\n"); + return NULL; + } + + rc = spdk_bdev_open(bdev, true, spdk_scsi_lun_hot_remove, lun, &lun->bdev_desc); + + if (rc != 0) { + SPDK_ERRLOG("bdev %s cannot be opened, error=%d\n", spdk_bdev_get_name(bdev), rc); + free(lun); + return NULL; + } + + TAILQ_INIT(&lun->tasks); + + lun->bdev = bdev; + lun->io_channel = NULL; + lun->hotremove_cb = hotremove_cb; + lun->hotremove_ctx = hotremove_ctx; + TAILQ_INIT(&lun->open_descs); + + return lun; +} + +void +spdk_scsi_lun_destruct(struct spdk_scsi_lun *lun) +{ + spdk_scsi_lun_hot_remove(lun); +} + +int +spdk_scsi_lun_open(struct spdk_scsi_lun *lun, spdk_scsi_remove_cb_t hotremove_cb, + void *hotremove_ctx, struct spdk_scsi_desc **_desc) +{ + struct spdk_scsi_desc *desc; + + desc = calloc(1, sizeof(*desc)); + if (desc == NULL) { + SPDK_ERRLOG("calloc() failed for LUN descriptor.\n"); + return -ENOMEM; + } + + TAILQ_INSERT_TAIL(&lun->open_descs, desc, link); + + desc->lun = lun; + desc->hotremove_cb = hotremove_cb; + desc->hotremove_ctx = hotremove_ctx; + *_desc = desc; + + return 0; +} + +void +spdk_scsi_lun_close(struct spdk_scsi_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + TAILQ_REMOVE(&lun->open_descs, desc, link); + free(desc); + + assert(!TAILQ_EMPTY(&lun->open_descs) || lun->io_channel == NULL); +} + +int +_spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel != NULL) { + if (spdk_get_thread() == spdk_io_channel_get_thread(lun->io_channel)) { + lun->ref++; + return 0; + } + SPDK_ERRLOG("io_channel already allocated for lun %s\n", + spdk_bdev_get_name(lun->bdev)); + return -1; + } + + lun->io_channel = spdk_bdev_get_io_channel(lun->bdev_desc); + if (lun->io_channel == NULL) { + return -1; + } + lun->ref = 1; + return 0; +} + +void +_spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun *lun) +{ + if (lun->io_channel == NULL) { + return; + } + + if (spdk_get_thread() != spdk_io_channel_get_thread(lun->io_channel)) { + SPDK_ERRLOG("io_channel was freed by different thread\n"); + return; + } + + lun->ref--; + if (lun->ref == 0) { + spdk_put_io_channel(lun->io_channel); + lun->io_channel = NULL; + } +} + +int +spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + return _spdk_scsi_lun_allocate_io_channel(lun); +} + +void +spdk_scsi_lun_free_io_channel(struct spdk_scsi_desc *desc) +{ + struct spdk_scsi_lun *lun = desc->lun; + + _spdk_scsi_lun_free_io_channel(lun); +} + +int +spdk_scsi_lun_get_id(const struct spdk_scsi_lun *lun) +{ + return lun->id; +} + +const char * +spdk_scsi_lun_get_bdev_name(const struct spdk_scsi_lun *lun) +{ + return spdk_bdev_get_name(lun->bdev); +} + +const struct spdk_scsi_dev * +spdk_scsi_lun_get_dev(const struct spdk_scsi_lun *lun) +{ + return lun->dev; +} + +bool +spdk_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun) +{ + return !TAILQ_EMPTY(&lun->tasks); +} + +bool +spdk_scsi_lun_is_removing(const struct spdk_scsi_lun *lun) +{ + 
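/*
 * Example (illustrative sketch): the open/close and I/O-channel pairing a
 * frontend is expected to follow. Passing a NULL hot-remove callback means
 * the descriptor is simply closed when the underlying bdev disappears (see
 * spdk_scsi_lun_notify_hot_remove() above); the helper name is hypothetical.
 */
static int
example_use_lun(struct spdk_scsi_lun *lun)
{
	struct spdk_scsi_desc *desc;
	int rc;

	rc = spdk_scsi_lun_open(lun, NULL, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	/* Must run on the thread that will submit tasks to this LUN. */
	rc = spdk_scsi_lun_allocate_io_channel(desc);
	if (rc != 0) {
		spdk_scsi_lun_close(desc);
		return rc;
	}

	/* ... queue tasks via spdk_scsi_dev_queue_task() ... */

	spdk_scsi_lun_free_io_channel(desc);
	spdk_scsi_lun_close(desc);
	return 0;
}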
return lun->removed; +} diff --git a/src/spdk/lib/scsi/port.c b/src/spdk/lib/scsi/port.c new file mode 100644 index 00000000..70d72004 --- /dev/null +++ b/src/spdk/lib/scsi/port.c @@ -0,0 +1,96 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +struct spdk_scsi_port * +spdk_scsi_port_create(uint64_t id, uint16_t index, const char *name) +{ + struct spdk_scsi_port *port; + + port = calloc(1, sizeof(struct spdk_scsi_port)); + + if (!port) { + return NULL; + } + + if (spdk_scsi_port_construct(port, id, index, name) != 0) { + spdk_scsi_port_free(&port); + return NULL; + } + + return port; +} + +void +spdk_scsi_port_free(struct spdk_scsi_port **pport) +{ + struct spdk_scsi_port *port; + + if (!pport) { + return; + } + + port = *pport; + *pport = NULL; + free(port); +} + +int +spdk_scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, uint16_t index, + const char *name) +{ + if (strlen(name) >= sizeof(port->name)) { + SPDK_ERRLOG("port name too long\n"); + return -1; + } + + port->is_used = 1; + port->id = id; + port->index = index; + snprintf(port->name, sizeof(port->name), "%s", name); + return 0; +} + +void +spdk_scsi_port_destruct(struct spdk_scsi_port *port) +{ + memset(port, 0, sizeof(struct spdk_scsi_port)); +} + +const char * +spdk_scsi_port_get_name(const struct spdk_scsi_port *port) +{ + return port->name; +} diff --git a/src/spdk/lib/scsi/scsi.c b/src/spdk/lib/scsi/scsi.c new file mode 100644 index 00000000..5dce0446 --- /dev/null +++ b/src/spdk/lib/scsi/scsi.c @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +struct spdk_scsi_globals g_spdk_scsi; + +int +spdk_scsi_init(void) +{ + int rc; + + rc = pthread_mutex_init(&g_spdk_scsi.mutex, NULL); + if (rc != 0) { + SPDK_ERRLOG("mutex_init() failed\n"); + return -1; + } + + return 0; +} + +void +spdk_scsi_fini(void) +{ + pthread_mutex_destroy(&g_spdk_scsi.mutex); +} + +SPDK_TRACE_REGISTER_FN(scsi_trace) +{ + spdk_trace_register_owner(OWNER_SCSI_DEV, 'd'); + spdk_trace_register_object(OBJECT_SCSI_TASK, 't'); + spdk_trace_register_description("SCSI_TASK_DONE", "", TRACE_SCSI_TASK_DONE, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); + spdk_trace_register_description("SCSI_TASK_START", "", TRACE_SCSI_TASK_START, + OWNER_SCSI_DEV, OBJECT_SCSI_TASK, 0, 0, ""); +} + +SPDK_LOG_REGISTER_COMPONENT("scsi", SPDK_LOG_SCSI) diff --git a/src/spdk/lib/scsi/scsi_bdev.c b/src/spdk/lib/scsi/scsi_bdev.c new file mode 100644 index 00000000..289d8626 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_bdev.c @@ -0,0 +1,2116 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "scsi_internal.h" + +/* + * TODO: move bdev SCSI error code translation tests to bdev unit test + * and remove this include. + */ +#include "spdk/bdev_module.h" + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#define SPDK_WORK_BLOCK_SIZE (4ULL * 1024ULL * 1024ULL) +#define SPDK_WORK_ATS_BLOCK_SIZE (1ULL * 1024ULL * 1024ULL) +#define MAX_SERIAL_STRING 32 + +#define DEFAULT_DISK_VENDOR "INTEL" +#define DEFAULT_DISK_REVISION "0001" +#define DEFAULT_DISK_ROTATION_RATE 1 /* Non-rotating medium */ +#define DEFAULT_DISK_FORM_FACTOR 0x02 /* 3.5 inch */ +#define DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT 256 + +#define INQUIRY_OFFSET(field) offsetof(struct spdk_scsi_cdb_inquiry_data, field) + \ + sizeof(((struct spdk_scsi_cdb_inquiry_data *)0x0)->field) + +static void spdk_bdev_scsi_process_block_resubmit(void *arg); + +static int +spdk_hex2bin(char ch) +{ + if ((ch >= '0') && (ch <= '9')) { + return ch - '0'; + } + ch = tolower(ch); + if ((ch >= 'a') && (ch <= 'f')) { + return ch - 'a' + 10; + } + return (int)ch; +} + +static void +spdk_bdev_scsi_set_naa_ieee_extended(const char *name, uint8_t *buf) +{ + int i, value, count = 0; + uint64_t local_value; + + for (i = 0; (i < 16) && (name[i] != '\0'); i++) { + value = spdk_hex2bin(name[i]); + if (i % 2) { + buf[count++] |= value << 4; + } else { + buf[count] = value; + } + } + + local_value = *(uint64_t *)buf; + /* + * see spc3r23 7.6.3.6.2, + * NAA IEEE Extended identifer format + */ + local_value &= 0x0fff000000ffffffull; + /* NAA 02, and 00 03 47 for IEEE Intel */ + local_value |= 0x2000000347000000ull; + + to_be64((void *)buf, local_value); +} + +static int +spdk_bdev_scsi_report_luns(struct spdk_scsi_lun *lun, + int sel, uint8_t *data, int alloc_len) +{ + struct spdk_scsi_dev *dev; + uint64_t fmt_lun, lun_id, method; + int hlen, len = 0; + int i; + + if (alloc_len < 8) { + return -1; + } + + if (sel == 0x00) { + /* logical unit with addressing method */ + } else if (sel == 0x01) { + /* well known logical unit */ + } else if (sel == 0x02) { + /* logical unit */ + } else { + return -1; + } + + /* LUN LIST LENGTH */ + memset(data, 0, 4); + + /* Reserved */ + memset(&data[4], 0, 4); + hlen = 8; + + dev = lun->dev; + + for (i = 0; i < SPDK_SCSI_DEV_MAX_LUN; i++) { + if (dev->lun[i] == NULL) { + continue; + } + + if (alloc_len - (hlen + len) < 8) { + return -1; + } + + lun_id = (uint64_t)i; + + if (SPDK_SCSI_DEV_MAX_LUN <= 0x0100) { + /* below 256 */ + method = 0x00U; + fmt_lun = (method & 0x03U) << 62; + fmt_lun |= (lun_id & 0x00ffU) << 48; + } else if (SPDK_SCSI_DEV_MAX_LUN <= 0x4000) { + /* below 16384 */ + method = 0x01U; + fmt_lun = (method 
& 0x03U) << 62;
+ fmt_lun |= (lun_id & 0x3fffU) << 48;
+ } else {
+ /* XXX */
+ fmt_lun = 0;
+ }
+
+ /* LUN */
+ to_be64(&data[hlen + len], fmt_lun);
+ len += 8;
+ }
+
+ /* LUN LIST LENGTH */
+ to_be32(data, len);
+
+ return hlen + len;
+}
+
+static int
+spdk_bdev_scsi_pad_scsi_name(char *dst, const char *name)
+{
+ size_t len;
+
+ len = strlen(name);
+ memcpy(dst, name, len);
+ do {
+ dst[len++] = '\0';
+ } while (len & 3);
+
+ return len;
+}
+
+static int
+spdk_bdev_scsi_inquiry(struct spdk_bdev *bdev, struct spdk_scsi_task *task,
+ uint8_t *cdb, uint8_t *data, uint16_t alloc_len)
+{
+ struct spdk_scsi_lun *lun;
+ struct spdk_scsi_dev *dev;
+ struct spdk_scsi_port *port;
+ uint32_t blocks, optimal_blocks;
+ int hlen = 0, plen, plen2;
+ uint16_t len = 0;
+ int pc;
+ int pd;
+ int evpd;
+ int i;
+ struct spdk_scsi_cdb_inquiry *inq = (struct spdk_scsi_cdb_inquiry *)cdb;
+
+ /* standard inquiry command requires at least 36 bytes */
+ if (alloc_len < 0x24) {
+ goto inq_error;
+ }
+
+ lun = task->lun;
+ dev = lun->dev;
+ port = task->target_port;
+
+ pd = SPDK_SPC_PERIPHERAL_DEVICE_TYPE_DISK;
+ pc = inq->page_code;
+ evpd = inq->evpd & 0x1;
+
+ if (!evpd && pc) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ return -1;
+ }
+
+ if (evpd) {
+ struct spdk_scsi_vpd_page *vpage = (struct spdk_scsi_vpd_page *)data;
+
+ /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */
+ vpage->peripheral_device_type = pd;
+ vpage->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED;
+ /* PAGE CODE */
+ vpage->page_code = pc;
+
+ /* Vital product data */
+ switch (pc) {
+ case SPDK_SPC_VPD_SUPPORTED_VPD_PAGES:
+ hlen = 4;
+
+ vpage->params[0] = SPDK_SPC_VPD_SUPPORTED_VPD_PAGES;
+ vpage->params[1] = SPDK_SPC_VPD_UNIT_SERIAL_NUMBER;
+ vpage->params[2] = SPDK_SPC_VPD_DEVICE_IDENTIFICATION;
+ vpage->params[3] = SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES;
+ vpage->params[4] = SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA;
+ vpage->params[5] = SPDK_SPC_VPD_MODE_PAGE_POLICY;
+ vpage->params[6] = SPDK_SPC_VPD_SCSI_PORTS;
+ vpage->params[7] = SPDK_SPC_VPD_BLOCK_LIMITS;
+ vpage->params[8] = SPDK_SPC_VPD_BLOCK_DEV_CHARS;
+ len = 9;
+ if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
+ vpage->params[9] = SPDK_SPC_VPD_BLOCK_THIN_PROVISION;
+ len++;
+ }
+
+ /* PAGE LENGTH */
+ to_be16(vpage->alloc_len, len);
+ break;
+
+ case SPDK_SPC_VPD_UNIT_SERIAL_NUMBER: {
+ const char *name = spdk_bdev_get_name(bdev);
+
+ hlen = 4;
+
+ /* PRODUCT SERIAL NUMBER */
+ len = strlen(name) + 1;
+ if (len > MAX_SERIAL_STRING) {
+ len = MAX_SERIAL_STRING;
+ }
+
+ memcpy(vpage->params, name, len - 1);
+ vpage->params[len - 1] = 0;
+
+ /* PAGE LENGTH */
+ to_be16(vpage->alloc_len, len);
+ break;
+ }
+
+ case SPDK_SPC_VPD_DEVICE_IDENTIFICATION: {
+ const char *name = spdk_bdev_get_name(bdev);
+ const char *product_name = spdk_bdev_get_product_name(bdev);
+ uint8_t protocol_id = dev->protocol_id;
+ uint8_t *buf = vpage->params;
+ struct spdk_scsi_desig_desc *desig;
+
+ hlen = 4;
+
+ /* Check total length by calculating how much space all entries take */
+ len = sizeof(struct spdk_scsi_desig_desc) + 8;
+ len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING;
+ len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_DEV_MAX_NAME + 1;
+ len += sizeof(struct spdk_scsi_desig_desc) + SPDK_SCSI_PORT_MAX_NAME_LENGTH;
+ len += sizeof(struct spdk_scsi_desig_desc) + 4;
+ len += sizeof(struct
spdk_scsi_desig_desc) + 4; + len += sizeof(struct spdk_scsi_desig_desc) + 4; + if (sizeof(struct spdk_scsi_vpd_page) + len > alloc_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + /* Now fill out the designator array */ + + /* NAA designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_NAA; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8; + spdk_bdev_scsi_set_naa_ieee_extended(name, desig->desig); + len = sizeof(struct spdk_scsi_desig_desc) + 8; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* T10 Vendor ID designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_ASCII; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_T10_VENDOR_ID; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 8 + 16 + MAX_SERIAL_STRING; + spdk_strcpy_pad(desig->desig, DEFAULT_DISK_VENDOR, 8, ' '); + spdk_strcpy_pad(&desig->desig[8], product_name, 16, ' '); + spdk_strcpy_pad(&desig->desig[24], name, MAX_SERIAL_STRING, ' '); + len += sizeof(struct spdk_scsi_desig_desc) + 8 + 16 + MAX_SERIAL_STRING; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Device Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_DEVICE; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = spdk_bdev_scsi_pad_scsi_name(desig->desig, dev->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* SCSI Port Name designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_UTF8; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = snprintf(desig->desig, SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", port->name); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Relative Target Port designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_RELATIVE_TARGET_PORT; + desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], port->index); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Target port group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_TARGET_PORT_GROUP; 
+ desig->association = SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 4); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + buf += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + /* Logical unit group designator */ + desig = (struct spdk_scsi_desig_desc *)buf; + desig->code_set = SPDK_SPC_VPD_CODE_SET_BINARY; + desig->protocol_id = protocol_id; + desig->type = SPDK_SPC_VPD_IDENTIFIER_TYPE_LOGICAL_UNIT_GROUP; + desig->association = SPDK_SPC_VPD_ASSOCIATION_LOGICAL_UNIT; + desig->reserved0 = 0; + desig->piv = 1; + desig->reserved1 = 0; + desig->len = 4; + memset(desig->desig, 0, 2); /* Reserved */ + to_be16(&desig->desig[2], dev->id); + len += sizeof(struct spdk_scsi_desig_desc) + desig->len; + + to_be16(vpage->alloc_len, len); + + break; + } + + case SPDK_SPC_VPD_EXTENDED_INQUIRY_DATA: { + struct spdk_scsi_vpd_ext_inquiry *vext = (struct spdk_scsi_vpd_ext_inquiry *)vpage; + + memset(vext, 0, sizeof(*vext)); + hlen = 4; + + /* RTO(3) GRD_CHK(2) APP_CHK(1) REF_CHK(0) */ + + /* GROUP_SUP(4) PRIOR_SUP(3) HEADSUP(2) ORDSUP(1) SIMPSUP(0) */ + vext->sup = SPDK_SCSI_VEXT_HEADSUP | SPDK_SCSI_VEXT_SIMPSUP; + + /* NV_SUP(1) V_SUP(0) */ + + /* Reserved[7-63] */ + + len = 64 - hlen; + + /* PAGE LENGTH */ + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_MANAGEMENT_NETWORK_ADDRESSES: + /* PAGE LENGTH */ + hlen = 4; + + to_be16(vpage->alloc_len, len); + break; + + case SPDK_SPC_VPD_MODE_PAGE_POLICY: { + struct spdk_scsi_mpage_policy_desc *pdesc = + (struct spdk_scsi_mpage_policy_desc *)vpage->params; + + hlen = 4; + + /* Mode page policy descriptor 1 */ + + /* POLICY PAGE CODE(5-0) */ + /* all page code */ + pdesc->page_code = 0x3f; + + /* POLICY SUBPAGE CODE */ + /* all sub page */ + pdesc->sub_page_code = 0xff; + + /* MLUS(7) MODE PAGE POLICY(1-0) */ + /* MLUS own copy */ + /* Shared MODE PAGE policy */ + pdesc->policy = 0; + /* Reserved */ + pdesc->reserved = 0; + + len += 4; + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_SCSI_PORTS: { + /* PAGE LENGTH */ + hlen = 4; + + /* Identification descriptor list */ + for (i = 0; i < SPDK_SCSI_DEV_MAX_PORTS; i++) { + struct spdk_scsi_port_desc *sdesc; + struct spdk_scsi_tgt_port_desc *pdesc; + + if (!dev->port[i].is_used) { + continue; + } + + /* Identification descriptor N */ + sdesc = (struct spdk_scsi_port_desc *)&vpage->params[len]; + + /* Reserved */ + sdesc->reserved = 0; + + /* RELATIVE PORT IDENTIFIER */ + to_be16(&sdesc->rel_port_id, dev->port[i].index); + + /* Reserved */ + sdesc->reserved2 = 0; + + /* INITIATOR PORT TRANSPORTID LENGTH */ + sdesc->init_port_len = 0; + + /* Reserved */ + sdesc->init_port_id = 0; + + /* TARGET PORT DESCRIPTORS LENGTH */ + sdesc->tgt_desc_len = 0; + + len += 12; + + plen2 = 0; + /* Target port descriptor 1 */ + pdesc = (struct spdk_scsi_tgt_port_desc *)sdesc->tgt_desc; + + /* PROTOCOL IDENTIFIER(7-4) CODE SET(3-0) */ + pdesc->code_set = + SPDK_SPC_PROTOCOL_IDENTIFIER_ISCSI << 4 | + SPDK_SPC_VPD_CODE_SET_UTF8; + + /* PIV(7) ASSOCIATION(5-4) IDENTIFIER TYPE(3-0) */ + pdesc->desig_type = SPDK_SPC_VPD_DESIG_PIV | + SPDK_SPC_VPD_ASSOCIATION_TARGET_PORT << 4 | + SPDK_SPC_VPD_IDENTIFIER_TYPE_SCSI_NAME; + + /* Reserved */ + pdesc->reserved = 0; + + /* IDENTIFIER */ + plen = snprintf((char *)pdesc->designator, + SPDK_SCSI_PORT_MAX_NAME_LENGTH, "%s", + dev->port[i].name); + pdesc->len = plen; + + plen2 += 4 + plen; + + /* TARGET PORT DESCRIPTORS LENGTH */ + 
to_be16(&sdesc->tgt_desc_len, plen2); + + len += plen2; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_LIMITS: { + uint32_t block_size = spdk_bdev_get_block_size(bdev); + + /* PAGE LENGTH */ + memset(&data[4], 0, 60); + + hlen = 4; + + /* WSNZ(0) */ + /* support zero length in WRITE SAME */ + + /* MAXIMUM COMPARE AND WRITE LENGTH */ + blocks = SPDK_WORK_ATS_BLOCK_SIZE / block_size; + + if (blocks > 0xff) { + blocks = 0xff; + } + + data[5] = (uint8_t)blocks; + + /* force align to 4KB */ + if (block_size < 4096) { + optimal_blocks = 4096 / block_size; + } else { + optimal_blocks = 1; + } + + /* OPTIMAL TRANSFER LENGTH GRANULARITY */ + to_be16(&data[6], optimal_blocks); + + blocks = SPDK_WORK_BLOCK_SIZE / block_size; + + /* MAXIMUM TRANSFER LENGTH */ + to_be32(&data[8], blocks); + /* OPTIMAL TRANSFER LENGTH */ + to_be32(&data[12], blocks); + + /* MAXIMUM PREFETCH XDREAD XDWRITE TRANSFER LENGTH */ + + len = 20 - hlen; + + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + /* + * MAXIMUM UNMAP LBA COUNT: indicates the + * maximum number of LBAs that may be + * unmapped by an UNMAP command. + */ + /* For now, choose 4MB as the maximum. */ + to_be32(&data[20], 4194304); + + /* + * MAXIMUM UNMAP BLOCK DESCRIPTOR COUNT: + * indicates the maximum number of UNMAP + * block descriptors that shall be contained + * in the parameter data transferred to the + * device server for an UNMAP command. + * The bdev layer automatically splits unmap + * requests, so pick an arbitrary high number here. + */ + to_be32(&data[24], DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + + /* + * The UGAVALID bit is left as 0 which means neither the + * OPTIMAL UNMAP GRANULARITY nor the UNMAP GRANULARITY + * ALIGNMENT fields are valid. + */ + + /* + * MAXIMUM WRITE SAME LENGTH: indicates the + * maximum number of contiguous logical blocks + * that the device server allows to be unmapped + * or written in a single WRITE SAME command. + */ + to_be64(&data[36], 512); + + /* Reserved */ + /* not specified */ + len = 64 - hlen; + } + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_DEV_CHARS: { + /* PAGE LENGTH */ + hlen = 4; + len = 64 - hlen; + + to_be16(&data[4], DEFAULT_DISK_ROTATION_RATE); + + /* Reserved */ + data[6] = 0; + /* NOMINAL FORM FACTOR(3-0) */ + data[7] = DEFAULT_DISK_FORM_FACTOR << 4; + /* Reserved */ + memset(&data[8], 0, 64 - 8); + + to_be16(vpage->alloc_len, len); + break; + } + + case SPDK_SPC_VPD_BLOCK_THIN_PROVISION: { + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + SPDK_ERRLOG("unsupported INQUIRY VPD page 0x%x\n", pc); + goto inq_error; + } + + hlen = 4; + len = 7; + + /* + * PAGE LENGTH : if the DP bit is set to one, then the + * page length shall be set 0004h. + */ + to_be16(&data[2], 0x0004); + + /* + * THRESHOLD EXPONENT : it indicates the threshold set + * size in LBAs as a power of 2( i.e., the threshold + * set size = 2 ^ (threshold exponent). + */ + data[4] = 0; + + /* + * Set the LBPU bit to indicate the support for UNMAP + * command. + */ + data[5] |= SPDK_SCSI_UNMAP_LBPU; + + /* + * Set the provisioning type to thin provision. 
+ */ + data[6] = SPDK_SCSI_UNMAP_THIN_PROVISIONING; + + to_be16(vpage->alloc_len, len); + break; + } + + default: + if (pc >= 0xc0 && pc <= 0xff) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "Vendor specific INQUIRY VPD page 0x%x\n", pc); + } else { + SPDK_ERRLOG("unsupported INQUIRY VPD page 0x%x\n", pc); + } + goto inq_error; + } + } else { + struct spdk_scsi_cdb_inquiry_data *inqdata = + (struct spdk_scsi_cdb_inquiry_data *)data; + + /* Standard INQUIRY data */ + /* PERIPHERAL QUALIFIER(7-5) PERIPHERAL DEVICE TYPE(4-0) */ + inqdata->peripheral_device_type = pd; + inqdata->peripheral_qualifier = SPDK_SPC_PERIPHERAL_QUALIFIER_CONNECTED; + + /* RMB(7) */ + inqdata->rmb = 0; + + /* VERSION */ + /* See SPC3/SBC2/MMC4/SAM2 for more details */ + inqdata->version = SPDK_SPC_VERSION_SPC3; + + /* NORMACA(5) HISUP(4) RESPONSE DATA FORMAT(3-0) */ + /* format 2 */ /* hierarchical support */ + inqdata->response = 2 | 1 << 4; + + hlen = 5; + + /* SCCS(7) ACC(6) TPGS(5-4) 3PC(3) PROTECT(0) */ + /* Not support TPGS */ + inqdata->flags = 0; + + /* MULTIP */ + inqdata->flags2 = 0x10; + + /* WBUS16(5) SYNC(4) LINKED(3) CMDQUE(1) VS(0) */ + /* CMDQUE */ + inqdata->flags3 = 0x2; + + /* T10 VENDOR IDENTIFICATION */ + spdk_strcpy_pad(inqdata->t10_vendor_id, DEFAULT_DISK_VENDOR, 8, ' '); + + /* PRODUCT IDENTIFICATION */ + spdk_strcpy_pad(inqdata->product_id, spdk_bdev_get_product_name(bdev), 16, ' '); + + /* PRODUCT REVISION LEVEL */ + spdk_strcpy_pad(inqdata->product_rev, DEFAULT_DISK_REVISION, 4, ' '); + + /* + * Standard inquiry data ends here. Only populate remaining fields if alloc_len + * indicates enough space to hold it. + */ + len = INQUIRY_OFFSET(product_rev) - 5; + + if (alloc_len >= INQUIRY_OFFSET(vendor)) { + /* Vendor specific */ + memset(inqdata->vendor, 0x20, 20); + len += sizeof(inqdata->vendor); + } + + if (alloc_len >= INQUIRY_OFFSET(ius)) { + /* CLOCKING(3-2) QAS(1) IUS(0) */ + inqdata->ius = 0; + len += sizeof(inqdata->ius); + } + + if (alloc_len >= INQUIRY_OFFSET(reserved)) { + /* Reserved */ + inqdata->reserved = 0; + len += sizeof(inqdata->reserved); + } + + /* VERSION DESCRIPTOR 1-8 */ + if (alloc_len >= INQUIRY_OFFSET(reserved) + 2) { + to_be16(&inqdata->desc[0], 0x0960); + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 4) { + to_be16(&inqdata->desc[2], 0x0300); /* SPC-3 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 6) { + to_be16(&inqdata->desc[4], 0x320); /* SBC-2 (no version claimed) */ + len += 2; + } + + if (alloc_len >= INQUIRY_OFFSET(reserved) + 8) { + to_be16(&inqdata->desc[6], 0x0040); /* SAM-2 (no version claimed) */ + len += 2; + } + + /* + * We only fill out 4 descriptors, but if the allocation length goes past + * that, zero the remaining bytes. This fixes some SCSI compliance tests + * which expect a full 96 bytes to be returned, including the unpopulated + * version descriptors 5-8 (4 * 2 = 8 bytes) plus the 22 bytes of reserved + * space (bytes 74-95) - for a total of 30 bytes. 
+ */ + if (alloc_len > INQUIRY_OFFSET(reserved) + 8) { + i = alloc_len - (INQUIRY_OFFSET(reserved) + 8); + if (i > 30) { + i = 30; + } + memset(&inqdata->desc[8], 0, i); + len += i; + } + + /* ADDITIONAL LENGTH */ + inqdata->add_len = len; + } + + return hlen + len; + +inq_error: + task->data_transferred = 0; + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static void +mode_sense_page_init(uint8_t *buf, int len, int page, int subpage) +{ + if (!buf) { + return; + } + + memset(buf, 0, len); + if (subpage != 0) { + buf[0] = page | 0x40; /* PAGE + SPF=1 */ + buf[1] = subpage; + to_be16(&buf[2], len - 4); + } else { + buf[0] = page; + buf[1] = len - 2; + } +} + +static int +spdk_bdev_scsi_mode_sense_page(struct spdk_bdev *bdev, + uint8_t *cdb, int pc, int page, int subpage, + uint8_t *data, struct spdk_scsi_task *task) +{ + uint8_t *cp = data; + int len = 0; + int plen; + int i; + + if (pc == 0x00) { + /* Current values */ + } else if (pc == 0x01) { + /* Changeable values */ + /* As we currently do not support changeable values, + all parameters are reported as zero. */ + } else if (pc == 0x02) { + /* Default values */ + } else { + /* Saved values not supported */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_SAVING_PARAMETERS_NOT_SUPPORTED, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + switch (page) { + case 0x00: + /* Vendor specific */ + break; + case 0x01: + /* Read-Write Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Read-Write Error Recovery\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x02: + /* Disconnect-Reconnect */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Disconnect-Reconnect\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0e + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x03: + /* Obsolete (Format Device) */ + break; + case 0x04: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x05: + /* Obsolete (Rigid Disk Geometry) */ + break; + case 0x06: + /* Reserved */ + break; + case 0x07: + /* Verify Error Recovery */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Verify Error Recovery\n"); + + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x08: { + /* Caching */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE Caching\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x12 + 2; + mode_sense_page_init(cp, plen, page, subpage); + + if (cp && spdk_bdev_has_write_cache(bdev) && pc != 0x01) { + cp[2] |= 0x4; /* WCE */ + } + + /* Read Cache Disable (RCD) = 1 */ + if (cp && pc != 0x01) { + cp[2] |= 0x1; + } + + len += plen; + break; + } + case 0x09: + /* Obsolete */ + break; + case 0x0a: + switch (subpage) { + case 0x00: + /* Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control\n"); + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x01: + /* Control Extension */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Control Extension\n"); + plen = 0x1c + 4; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0xff: + /* All subpages */ + len += spdk_bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x00, + cp ? 
&cp[len] : NULL, task); + len += spdk_bdev_scsi_mode_sense_page(bdev, + cdb, pc, page, + 0x01, + cp ? &cp[len] : NULL, task); + break; + default: + /* 0x02-0x3e: Reserved */ + break; + } + break; + case 0x0b: + /* Obsolete (Medium Types Supported) */ + break; + case 0x0c: + /* Obsolete (Notch And Partitio) */ + break; + case 0x0d: + /* Obsolete */ + break; + case 0x0e: + case 0x0f: + /* Reserved */ + break; + case 0x10: + /* XOR Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SENSE XOR Control\n"); + if (subpage != 0x00) { + break; + } + plen = 0x16 + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x11: + case 0x12: + case 0x13: + /* Reserved */ + break; + case 0x14: + /* Enclosure Services Management */ + break; + case 0x15: + case 0x16: + case 0x17: + /* Reserved */ + break; + case 0x18: + /* Protocol-Specific LUN */ + break; + case 0x19: + /* Protocol-Specific Port */ + break; + case 0x1a: + /* Power Condition */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Power Condition\n"); + if (subpage != 0x00) { + break; + } + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1b: + /* Reserved */ + break; + case 0x1c: + /* Informational Exceptions Control */ + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "MODE_SENSE Informational Exceptions Control\n"); + if (subpage != 0x00) { + break; + } + + plen = 0x0a + 2; + mode_sense_page_init(cp, plen, page, subpage); + len += plen; + break; + case 0x1d: + case 0x1e: + case 0x1f: + /* Reserved */ + break; + case 0x20: + case 0x21: + case 0x22: + case 0x23: + case 0x24: + case 0x25: + case 0x26: + case 0x27: + case 0x28: + case 0x29: + case 0x2a: + case 0x2b: + case 0x2c: + case 0x2d: + case 0x2e: + case 0x2f: + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x3d: + case 0x3e: + /* Vendor-specific */ + break; + case 0x3f: + switch (subpage) { + case 0x00: + /* All mode pages */ + for (i = 0x00; i < 0x3e; i ++) { + len += spdk_bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + break; + case 0xff: + /* All mode pages and subpages */ + for (i = 0x00; i < 0x3e; i ++) { + len += spdk_bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0x00, + cp ? &cp[len] : NULL, task); + } + for (i = 0x00; i < 0x3e; i ++) { + len += spdk_bdev_scsi_mode_sense_page( + bdev, cdb, pc, i, 0xff, + cp ? &cp[len] : NULL, task); + } + break; + default: + /* 0x01-0x3e: Reserved */ + break; + } + } + + return len; +} + +static int +spdk_bdev_scsi_mode_sense(struct spdk_bdev *bdev, int md, + uint8_t *cdb, int dbd, int llbaa, int pc, + int page, int subpage, uint8_t *data, struct spdk_scsi_task *task) +{ + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + uint8_t *hdr, *bdesc, *pages; + int hlen; + int blen; + int plen, total; + + assert(md == 6 || md == 10); + + if (md == 6) { + hlen = 4; + blen = 8; /* For MODE SENSE 6 only short LBA */ + } else { + hlen = 8; + blen = llbaa ? 16 : 8; + } + + if (dbd) { + blen = 0; + } + + pages = data ? 
&data[hlen + blen] : NULL;
+ plen = spdk_bdev_scsi_mode_sense_page(bdev, cdb, pc, page,
+ subpage,
+ pages, task);
+ if (plen < 0) {
+ return -1;
+ }
+
+ total = hlen + blen + plen;
+ if (data == NULL) {
+ return total;
+ }
+
+ hdr = &data[0];
+ if (hlen == 4) {
+ hdr[0] = total - 1; /* Mode Data Length */
+ hdr[1] = 0; /* Medium Type */
+ hdr[2] = 0; /* Device-Specific Parameter */
+ hdr[3] = blen; /* Block Descriptor Length */
+ } else {
+ to_be16(&hdr[0], total - 2); /* Mode Data Length */
+ hdr[2] = 0; /* Medium Type */
+ hdr[3] = 0; /* Device-Specific Parameter */
+ hdr[4] = llbaa ? 0x1 : 0; /* Long/short LBA */
+ hdr[5] = 0; /* Reserved */
+ to_be16(&hdr[6], blen); /* Block Descriptor Length */
+ }
+
+ bdesc = &data[hlen];
+ if (blen == 16) {
+ /* Number of Blocks */
+ to_be64(&bdesc[0], num_blocks);
+ /* Reserved */
+ memset(&bdesc[8], 0, 4);
+ /* Block Length */
+ to_be32(&bdesc[12], block_size);
+ } else if (blen == 8) {
+ /* Number of Blocks */
+ if (num_blocks > 0xffffffffULL) {
+ memset(&bdesc[0], 0xff, 4);
+ } else {
+ to_be32(&bdesc[0], num_blocks);
+ }
+
+ /* Block Length */
+ to_be32(&bdesc[4], block_size);
+ }
+
+ return total;
+}
+
+static int
+spdk_bdev_scsi_mode_select_page(struct spdk_bdev *bdev,
+ uint8_t *cdb, int pf, int sp,
+ uint8_t *data, size_t len)
+{
+ size_t hlen, plen;
+ int spf, page, subpage;
+ int rc;
+
+ /* vendor specific */
+ if (pf == 0) {
+ return 0;
+ }
+
+ if (len < 1) {
+ return 0;
+ }
+
+ spf = !!(data[0] & 0x40);
+ page = data[0] & 0x3f;
+ if (spf) {
+ /* Sub_page mode page format */
+ hlen = 4;
+ if (len < hlen) {
+ return 0;
+ }
+ subpage = data[1];
+
+ plen = from_be16(&data[2]);
+ } else {
+ /* Page_0 mode page format */
+ hlen = 2;
+ if (len < hlen) {
+ return 0;
+ }
+ subpage = 0;
+ plen = data[1];
+ }
+
+ plen += hlen;
+ if (len < plen) {
+ return 0;
+ }
+
+ switch (page) {
+ case 0x08: { /* Caching */
+ //int wce;
+
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SELECT Caching\n");
+ if (subpage != 0x00) {
+ break;
+ }
+
+ if (plen != 0x12 + hlen) {
+ /* unknown format */
+ break;
+ }
+
+ // TODO:
+ //wce = data[2] & 0x4; /* WCE */
+
+ //fd = bdev->fd;
+ //
+ //rc = fcntl(fd, F_GETFL, 0);
+ //if (rc != -1) {
+ // if (wce) {
+ // SPDK_DEBUGLOG(SPDK_LOG_SCSI, "MODE_SELECT Writeback cache enable\n");
+ // rc = fcntl(fd, F_SETFL, (rc & ~O_FSYNC));
+ // bdev->write_cache = 1;
+ // } else {
+ // rc = fcntl(fd, F_SETFL, (rc | O_FSYNC));
+ // bdev->write_cache = 0;
+ // }
+ //}
+
+ break;
+ }
+ default:
+ /* not supported */
+ break;
+ }
+
+ len -= plen;
+ if (len != 0) {
+ rc = spdk_bdev_scsi_mode_select_page(bdev, cdb, pf, sp, &data[plen], len);
+ if (rc < 0) {
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static void
+spdk_bdev_scsi_task_complete_cmd(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_scsi_task *task = cb_arg;
+ int sc, sk, asc, ascq;
+
+ task->bdev_io = bdev_io;
+
+ spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq);
+ spdk_scsi_task_set_status(task, sc, sk, asc, ascq);
+ spdk_scsi_lun_complete_task(task->lun, task);
+}
+
+static void
+spdk_bdev_scsi_task_complete_mgmt(struct spdk_bdev_io *bdev_io, bool success,
+ void *cb_arg)
+{
+ struct spdk_scsi_task *task = cb_arg;
+
+ task->bdev_io = bdev_io;
+
+ if (success) {
+ task->response = SPDK_SCSI_TASK_MGMT_RESP_SUCCESS;
+ }
+
+ spdk_scsi_lun_complete_mgmt_task(task->lun, task);
+}
+
+static void
+spdk_bdev_scsi_queue_io(struct spdk_scsi_task *task, spdk_bdev_io_wait_cb cb_fn, void *cb_arg)
+{
+ struct spdk_scsi_lun *lun = task->lun;
+ struct spdk_bdev *bdev
= lun->bdev; + struct spdk_io_channel *ch = lun->io_channel; + int rc; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = cb_fn; + task->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &task->bdev_io_wait); + if (rc != 0) { + assert(false); + } +} + +static int +spdk_bdev_scsi_read(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t len) +{ + uint64_t blen; + uint64_t offset; + uint64_t nbytes; + int rc; + + blen = spdk_bdev_get_block_size(bdev); + + lba += (task->offset / blen); + offset = lba * blen; + nbytes = task->length; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "Read: lba=%"PRIu64", len=%"PRIu64"\n", + lba, (uint64_t)task->length / blen); + + rc = spdk_bdev_readv(bdev_desc, bdev_ch, task->iovs, + task->iovcnt, offset, nbytes, + spdk_bdev_scsi_task_complete_cmd, task); + + if (rc) { + if (rc == -ENOMEM) { + spdk_bdev_scsi_queue_io(task, spdk_bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_readv() failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + task->data_transferred = nbytes; + return SPDK_SCSI_TASK_PENDING; +} + +static int +spdk_bdev_scsi_write(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t len) +{ + uint64_t blen; + uint64_t offset; + uint64_t nbytes; + int rc; + + blen = spdk_bdev_get_block_size(bdev); + offset = lba * blen; + nbytes = ((uint64_t)len) * blen; + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, + "Write: lba=%"PRIu64", len=%u\n", + lba, len); + + if (nbytes > task->transfer_len) { + SPDK_ERRLOG("nbytes(%zu) > transfer_len(%u)\n", + (size_t)nbytes, task->transfer_len); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + offset += task->offset; + rc = spdk_bdev_writev(bdev_desc, bdev_ch, task->iovs, + task->iovcnt, offset, task->length, + spdk_bdev_scsi_task_complete_cmd, + task); + + if (rc) { + if (rc == -ENOMEM) { + spdk_bdev_scsi_queue_io(task, spdk_bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_writev failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "Wrote %"PRIu64"/%"PRIu64" bytes\n", + (uint64_t)task->length, nbytes); + + task->data_transferred = task->length; + return SPDK_SCSI_TASK_PENDING; +} + +static int +spdk_bdev_scsi_sync(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, + uint64_t lba, uint32_t num_blocks) +{ + uint64_t bdev_num_blocks; + int rc; + + if (num_blocks == 0) { + return SPDK_SCSI_TASK_COMPLETE; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + + if (lba >= bdev_num_blocks || num_blocks > bdev_num_blocks || + lba > (bdev_num_blocks - num_blocks)) { + SPDK_ERRLOG("end of media\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + 
SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(bdev_desc, bdev_ch, lba, num_blocks, + spdk_bdev_scsi_task_complete_cmd, task); + + if (rc) { + if (rc == -ENOMEM) { + spdk_bdev_scsi_queue_io(task, spdk_bdev_scsi_process_block_resubmit, task); + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("spdk_bdev_flush_blocks() failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + task->data_transferred = 0; + return SPDK_SCSI_TASK_PENDING; +} + +static int +spdk_bdev_scsi_readwrite(struct spdk_scsi_task *task, + uint64_t lba, uint32_t xfer_len, bool is_read) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + struct spdk_bdev_desc *bdev_desc = lun->bdev_desc; + struct spdk_io_channel *bdev_ch = lun->io_channel; + uint64_t bdev_num_blocks; + uint32_t max_xfer_len; + + task->data_transferred = 0; + + if (spdk_unlikely(task->dxfer_dir != SPDK_SCSI_DIR_NONE && + task->dxfer_dir != (is_read ? SPDK_SCSI_DIR_FROM_DEV : SPDK_SCSI_DIR_TO_DEV))) { + SPDK_ERRLOG("Incorrect data direction\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + if (spdk_unlikely(bdev_num_blocks <= lba || bdev_num_blocks - lba < xfer_len)) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "end of media\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + if (spdk_unlikely(xfer_len == 0)) { + task->status = SPDK_SCSI_STATUS_GOOD; + return SPDK_SCSI_TASK_COMPLETE; + } + + /* Transfer Length is limited to the Block Limits VPD page Maximum Transfer Length */ + max_xfer_len = SPDK_WORK_BLOCK_SIZE / spdk_bdev_get_block_size(bdev); + if (spdk_unlikely(xfer_len > max_xfer_len)) { + SPDK_ERRLOG("xfer_len %" PRIu32 " > maximum transfer length %" PRIu32 "\n", + xfer_len, max_xfer_len); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + if (is_read) { + return spdk_bdev_scsi_read(bdev, bdev_desc, bdev_ch, task, lba, xfer_len); + } else { + return spdk_bdev_scsi_write(bdev, bdev_desc, bdev_ch, task, lba, xfer_len); + } +} + +struct spdk_bdev_scsi_unmap_ctx { + struct spdk_scsi_task *task; + struct spdk_scsi_unmap_bdesc desc[DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT]; + uint32_t count; +}; + +static int spdk_bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, struct spdk_bdev_scsi_unmap_ctx *ctx); + +static void +spdk_bdev_scsi_task_complete_unmap_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = cb_arg; + struct spdk_scsi_task *task = ctx->task; + int sc, sk, asc, ascq; + + ctx->count--; + + task->bdev_io = bdev_io; + + if (task->status == SPDK_SCSI_STATUS_GOOD) { + spdk_bdev_io_get_scsi_status(bdev_io, &sc, &sk, &asc, &ascq); + 
spdk_scsi_task_set_status(task, sc, sk, asc, ascq); + } + + if (ctx->count == 0) { + spdk_scsi_lun_complete_task(task->lun, task); + free(ctx); + } +} + +static int +__copy_desc(struct spdk_bdev_scsi_unmap_ctx *ctx, uint8_t *data, size_t data_len) +{ + uint16_t desc_data_len; + uint16_t desc_count; + + if (!data) { + return -EINVAL; + } + + if (data_len < 8) { + /* We can't even get the reported length, so fail. */ + return -EINVAL; + } + + desc_data_len = from_be16(&data[2]); + desc_count = desc_data_len / 16; + + if (desc_data_len > (data_len - 8)) { + SPDK_ERRLOG("Error - desc_data_len (%u) > data_len (%lu) - 8\n", + desc_data_len, data_len); + return -EINVAL; + } + + if (desc_count > DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT) { + SPDK_ERRLOG("desc_count (%u) greater than max allowed (%u)\n", + desc_count, DEFAULT_MAX_UNMAP_BLOCK_DESCRIPTOR_COUNT); + return -EINVAL; + } + + memcpy(ctx->desc, &data[8], desc_data_len); + return desc_count; +} + +static void +spdk_bdev_scsi_unmap_resubmit(void *arg) +{ + struct spdk_bdev_scsi_unmap_ctx *ctx = arg; + struct spdk_scsi_task *task = ctx->task; + struct spdk_scsi_lun *lun = task->lun; + + spdk_bdev_scsi_unmap(lun->bdev, lun->bdev_desc, lun->io_channel, task, ctx); +} + +static int +spdk_bdev_scsi_unmap(struct spdk_bdev *bdev, struct spdk_bdev_desc *bdev_desc, + struct spdk_io_channel *bdev_ch, struct spdk_scsi_task *task, struct spdk_bdev_scsi_unmap_ctx *ctx) +{ + uint8_t *data; + int desc_count, i; + int data_len; + int rc; + + assert(task->status == SPDK_SCSI_STATUS_GOOD); + + if (ctx == NULL) { + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + + ctx->task = task; + ctx->count = 0; + } + + + if (task->iovcnt == 1) { + data = (uint8_t *)task->iovs[0].iov_base; + data_len = task->iovs[0].iov_len; + desc_count = __copy_desc(ctx, data, data_len); + } else { + data = spdk_scsi_task_gather_data(task, &data_len); + desc_count = __copy_desc(ctx, data, data_len); + if (desc_count < 0) { + spdk_dma_free(data); + } + } + + if (desc_count < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + for (i = ctx->count; i < desc_count; i++) { + struct spdk_scsi_unmap_bdesc *desc; + uint64_t offset_blocks; + uint64_t num_blocks; + + desc = &ctx->desc[i]; + + offset_blocks = from_be64(&desc->lba); + num_blocks = from_be32(&desc->block_count); + + if (num_blocks == 0) { + continue; + } + + ctx->count++; + rc = spdk_bdev_unmap_blocks(bdev_desc, bdev_ch, offset_blocks, num_blocks, + spdk_bdev_scsi_task_complete_unmap_cmd, ctx); + + if (rc) { + if (rc == -ENOMEM) { + spdk_bdev_scsi_queue_io(task, spdk_bdev_scsi_unmap_resubmit, ctx); + /* Unmap was not yet submitted to bdev */ + ctx->count--; + return SPDK_SCSI_TASK_PENDING; + } + SPDK_ERRLOG("SCSI Unmapping failed\n"); + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + ctx->count--; + /* We can't complete here - we may have to wait for previously + * submitted unmaps to complete */ + break; + } + } + + if (ctx->count == 0) { + free(ctx); + return SPDK_SCSI_TASK_COMPLETE; + } + + return 
SPDK_SCSI_TASK_PENDING; +} + +static int +spdk_bdev_scsi_process_block(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + uint64_t lba; + uint32_t xfer_len; + uint32_t len = 0; + uint8_t *cdb = task->cdb; + + /* XXX: We need to support FUA bit for writes! */ + switch (cdb[0]) { + case SPDK_SBC_READ_6: + case SPDK_SBC_WRITE_6: + lba = (uint64_t)cdb[1] << 16; + lba |= (uint64_t)cdb[2] << 8; + lba |= (uint64_t)cdb[3]; + xfer_len = cdb[4]; + if (xfer_len == 0) { + xfer_len = 256; + } + return spdk_bdev_scsi_readwrite(task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_6); + + case SPDK_SBC_READ_10: + case SPDK_SBC_WRITE_10: + lba = from_be32(&cdb[2]); + xfer_len = from_be16(&cdb[7]); + return spdk_bdev_scsi_readwrite(task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_10); + + case SPDK_SBC_READ_12: + case SPDK_SBC_WRITE_12: + lba = from_be32(&cdb[2]); + xfer_len = from_be32(&cdb[6]); + return spdk_bdev_scsi_readwrite(task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_12); + case SPDK_SBC_READ_16: + case SPDK_SBC_WRITE_16: + lba = from_be64(&cdb[2]); + xfer_len = from_be32(&cdb[10]); + return spdk_bdev_scsi_readwrite(task, lba, xfer_len, + cdb[0] == SPDK_SBC_READ_16); + + case SPDK_SBC_READ_CAPACITY_10: { + uint64_t num_blocks = spdk_bdev_get_num_blocks(bdev); + uint8_t buffer[8]; + + if (num_blocks - 1 > 0xffffffffULL) { + memset(buffer, 0xff, 4); + } else { + to_be32(buffer, num_blocks - 1); + } + to_be32(&buffer[4], spdk_bdev_get_block_size(bdev)); + + len = spdk_min(task->length, sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + case SPDK_SPC_SERVICE_ACTION_IN_16: + switch (cdb[1] & 0x1f) { /* SERVICE ACTION */ + case SPDK_SBC_SAI_READ_CAPACITY_16: { + uint8_t buffer[32] = {0}; + + to_be64(&buffer[0], spdk_bdev_get_num_blocks(bdev) - 1); + to_be32(&buffer[8], spdk_bdev_get_block_size(bdev)); + /* + * Set the TPE bit to 1 to indicate thin provisioning. + * The position of TPE bit is the 7th bit in 14th byte + * in READ CAPACITY (16) parameter data. 
+ */ + if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) { + buffer[14] |= 1 << 7; + } + + len = spdk_min(from_be32(&cdb[10]), sizeof(buffer)); + if (spdk_scsi_task_scatter_data(task, buffer, len) < 0) { + break; + } + + task->data_transferred = len; + task->status = SPDK_SCSI_STATUS_GOOD; + break; + } + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + break; + + case SPDK_SBC_SYNCHRONIZE_CACHE_10: + case SPDK_SBC_SYNCHRONIZE_CACHE_16: + if (cdb[0] == SPDK_SBC_SYNCHRONIZE_CACHE_10) { + lba = from_be32(&cdb[2]); + len = from_be16(&cdb[7]); + } else { + lba = from_be64(&cdb[2]); + len = from_be32(&cdb[10]); + } + + if (len == 0) { + len = spdk_bdev_get_num_blocks(bdev) - lba; + } + + return spdk_bdev_scsi_sync(bdev, lun->bdev_desc, lun->io_channel, task, lba, len); + break; + + case SPDK_SBC_UNMAP: + return spdk_bdev_scsi_unmap(bdev, lun->bdev_desc, lun->io_channel, task, NULL); + + default: + return SPDK_SCSI_TASK_UNKNOWN; + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +static void +spdk_bdev_scsi_process_block_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + spdk_bdev_scsi_process_block(task); +} + +static int +spdk_bdev_scsi_check_len(struct spdk_scsi_task *task, int len, int min_len) +{ + if (len >= min_len) { + return 0; + } + + /* INVALID FIELD IN CDB */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; +} + +static int +spdk_bdev_scsi_process_primary(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + struct spdk_bdev *bdev = lun->bdev; + int alloc_len = -1; + int data_len = -1; + uint8_t *cdb = task->cdb; + uint8_t *data = NULL; + int rc = 0; + int pllen, md = 0; + int pf, sp; + int bdlen = 0, llba; + int dbd, pc, page, subpage; + int cmd_parsed = 0; + + + switch (cdb[0]) { + case SPDK_SPC_INQUIRY: + alloc_len = from_be16(&cdb[3]); + data_len = spdk_max(4096, alloc_len); + data = spdk_dma_zmalloc(data_len, 0, NULL); + assert(data != NULL); + rc = spdk_bdev_scsi_inquiry(bdev, task, cdb, data, data_len); + data_len = spdk_min(rc, data_len); + if (rc < 0) { + break; + } + + SPDK_TRACEDUMP(SPDK_LOG_SCSI, "INQUIRY", data, data_len); + break; + + case SPDK_SPC_REPORT_LUNS: { + int sel; + + sel = cdb[2]; + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "sel=%x\n", sel); + + alloc_len = from_be32(&cdb[6]); + rc = spdk_bdev_scsi_check_len(task, alloc_len, 16); + if (rc < 0) { + break; + } + + data_len = spdk_max(4096, alloc_len); + data = spdk_dma_zmalloc(data_len, 0, NULL); + assert(data != NULL); + rc = spdk_bdev_scsi_report_luns(task->lun, sel, data, data_len); + data_len = rc; + if (rc < 0) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_NO_SENSE, + SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + SPDK_TRACEDUMP(SPDK_LOG_SCSI, "REPORT LUNS", data, data_len); + break; + } + + case SPDK_SPC_MODE_SELECT_6: + case SPDK_SPC_MODE_SELECT_10: + if (cdb[0] == SPDK_SPC_MODE_SELECT_6) { + /* MODE_SELECT(6) must have at least a 4 byte header. */ + md = 4; + pllen = cdb[4]; + } else { + /* MODE_SELECT(10) must have at least an 8 byte header. 
*/
+ md = 8;
+ pllen = from_be16(&cdb[7]);
+ }
+
+ if (pllen == 0) {
+ break;
+ }
+
+ rc = spdk_bdev_scsi_check_len(task, pllen, md);
+ if (rc < 0) {
+ break;
+ }
+
+ data = spdk_scsi_task_gather_data(task, &rc);
+ if (rc < 0) {
+ break;
+ }
+
+ data_len = rc;
+ if (cdb[0] == SPDK_SPC_MODE_SELECT_6) {
+ rc = spdk_bdev_scsi_check_len(task, data_len, 4);
+ if (rc >= 0) {
+ bdlen = data[3];
+ }
+
+ } else {
+ rc = spdk_bdev_scsi_check_len(task, data_len, 8);
+ if (rc >= 0) {
+ bdlen = from_be16(&data[6]);
+ }
+ }
+
+ if (rc < 0) {
+ break;
+ }
+ pf = !!(cdb[1] & 0x10);
+ sp = !!(cdb[1] & 0x1);
+
+ /* page data */
+ rc = spdk_bdev_scsi_mode_select_page(
+ bdev, cdb,
+ pf, sp,
+ &data[md + bdlen],
+ pllen - (md + bdlen));
+ if (rc < 0) {
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_NO_SENSE,
+ SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ rc = pllen;
+ data_len = 0;
+ break;
+
+ case SPDK_SPC_MODE_SENSE_6:
+ alloc_len = cdb[4];
+ md = 6;
+ /* FALLTHROUGH */
+ case SPDK_SPC_MODE_SENSE_10:
+ llba = 0;
+
+ if (md == 0) {
+ alloc_len = from_be16(&cdb[7]);
+ llba = !!(cdb[1] & 0x10);
+ md = 10;
+ }
+
+ dbd = !!(cdb[1] & 0x8);
+ pc = (cdb[2] & 0xc0) >> 6;
+ page = cdb[2] & 0x3f;
+ subpage = cdb[3];
+
+ /* First call with no buffer to discover needed buffer size */
+ rc = spdk_bdev_scsi_mode_sense(bdev, md,
+ cdb, dbd, llba, pc,
+ page, subpage,
+ NULL, task);
+ if (rc < 0) {
+ break;
+ }
+
+ data_len = rc;
+ data = spdk_dma_zmalloc(data_len, 0, NULL);
+ assert(data != NULL);
+
+ /* Second call with the allocated buffer to fill in the mode sense data */
+ rc = spdk_bdev_scsi_mode_sense(bdev, md,
+ cdb, dbd, llba, pc,
+ page, subpage,
+ data, task);
+ if (rc < 0) {
+ /* INVALID FIELD IN CDB */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+ break;
+
+ case SPDK_SPC_REQUEST_SENSE: {
+ int desc;
+ int sk, asc, ascq;
+
+ desc = cdb[1] & 0x1;
+ if (desc != 0) {
+ /* INVALID FIELD IN CDB */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ alloc_len = cdb[4];
+
+ /* NO ADDITIONAL SENSE INFORMATION */
+ sk = SPDK_SCSI_SENSE_NO_SENSE;
+ asc = 0x00;
+ ascq = 0x00;
+
+ spdk_scsi_task_build_sense_data(task, sk, asc, ascq);
+
+ data_len = task->sense_data_len;
+ data = spdk_dma_zmalloc(data_len, 0, NULL);
+ assert(data != NULL);
+ memcpy(data, task->sense_data, data_len);
+ break;
+ }
+
+ case SPDK_SPC_LOG_SELECT:
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SELECT\n");
+ cmd_parsed = 1;
+ /* FALLTHROUGH */
+ case SPDK_SPC_LOG_SENSE:
+ if (!cmd_parsed) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "LOG_SENSE\n");
+ }
+
+ /* INVALID COMMAND OPERATION CODE */
+ spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION,
+ SPDK_SCSI_SENSE_ILLEGAL_REQUEST,
+ SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE,
+ SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ rc = -1;
+ break;
+
+ case SPDK_SPC_TEST_UNIT_READY:
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "TEST_UNIT_READY\n");
+ cmd_parsed = 1;
+ /* FALLTHROUGH */
+ case SPDK_SBC_START_STOP_UNIT:
+ if (!cmd_parsed) {
+ SPDK_DEBUGLOG(SPDK_LOG_SCSI, "START_STOP_UNIT\n");
+ }
+
+ rc = 0;
+ break;
+
+ default:
+ return SPDK_SCSI_TASK_UNKNOWN;
+ }
+
+ if (rc >= 0 && data_len > 0) {
+ assert(alloc_len >= 0);
+ spdk_scsi_task_scatter_data(task, data,
spdk_min(alloc_len, data_len)); + rc = spdk_min(data_len, alloc_len); + } + + if (rc >= 0) { + task->data_transferred = rc; + task->status = SPDK_SCSI_STATUS_GOOD; + } + + if (data) { + spdk_dma_free(data); + } + + return SPDK_SCSI_TASK_COMPLETE; +} + +int +spdk_bdev_scsi_execute(struct spdk_scsi_task *task) +{ + int rc; + + if ((rc = spdk_bdev_scsi_process_block(task)) == SPDK_SCSI_TASK_UNKNOWN) { + if ((rc = spdk_bdev_scsi_process_primary(task)) == SPDK_SCSI_TASK_UNKNOWN) { + SPDK_DEBUGLOG(SPDK_LOG_SCSI, "unsupported SCSI OP=0x%x\n", task->cdb[0]); + /* INVALID COMMAND OPERATION CODE */ + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_COMMAND_OPERATION_CODE, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return SPDK_SCSI_TASK_COMPLETE; + } + } + + return rc; +} + +static void +spdk_bdev_scsi_reset_resubmit(void *arg) +{ + struct spdk_scsi_task *task = arg; + + spdk_bdev_scsi_reset(task); +} + +void +spdk_bdev_scsi_reset(struct spdk_scsi_task *task) +{ + struct spdk_scsi_lun *lun = task->lun; + int rc; + + rc = spdk_bdev_reset(lun->bdev_desc, lun->io_channel, spdk_bdev_scsi_task_complete_mgmt, task); + if (rc == -ENOMEM) { + spdk_bdev_scsi_queue_io(task, spdk_bdev_scsi_reset_resubmit, task); + } +} diff --git a/src/spdk/lib/scsi/scsi_internal.h b/src/spdk/lib/scsi/scsi_internal.h new file mode 100644 index 00000000..85caf762 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_internal.h @@ -0,0 +1,160 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_SCSI_INTERNAL_H +#define SPDK_SCSI_INTERNAL_H + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/trace.h" + +#include "spdk_internal/log.h" + +enum { + SPDK_SCSI_TASK_UNKNOWN = -1, + SPDK_SCSI_TASK_COMPLETE, + SPDK_SCSI_TASK_PENDING, +}; + +struct spdk_scsi_port { + uint8_t is_used; + uint64_t id; + uint16_t index; + char name[SPDK_SCSI_PORT_MAX_NAME_LENGTH]; +}; + +struct spdk_scsi_dev { + int id; + int is_allocated; + bool removed; + + char name[SPDK_SCSI_DEV_MAX_NAME + 1]; + + struct spdk_scsi_lun *lun[SPDK_SCSI_DEV_MAX_LUN]; + + int num_ports; + struct spdk_scsi_port port[SPDK_SCSI_DEV_MAX_PORTS]; + + uint8_t protocol_id; +}; + +struct spdk_scsi_desc { + struct spdk_scsi_lun *lun; + spdk_scsi_remove_cb_t hotremove_cb; + void *hotremove_ctx; + TAILQ_ENTRY(spdk_scsi_desc) link; +}; + +struct spdk_scsi_lun { + /** LUN id for this logical unit. */ + int id; + + /** Pointer to the SCSI device containing this LUN. */ + struct spdk_scsi_dev *dev; + + /** The bdev associated with this LUN. */ + struct spdk_bdev *bdev; + + /** Descriptor for opened block device. */ + struct spdk_bdev_desc *bdev_desc; + + /** I/O channel for the bdev associated with this LUN. */ + struct spdk_io_channel *io_channel; + + /** The reference number for this LUN, thus we can correctly free the io_channel */ + uint32_t ref; + + /** Poller to release the resource of the lun when it is hot removed */ + struct spdk_poller *hotremove_poller; + + /** The LUN is removed */ + bool removed; + + /** Callback to be fired when LUN removal is first triggered. */ + void (*hotremove_cb)(const struct spdk_scsi_lun *lun, void *arg); + + /** Argument for hotremove_cb */ + void *hotremove_ctx; + + /** List of open descriptors for this LUN. */ + TAILQ_HEAD(, spdk_scsi_desc) open_descs; + + /** pending tasks */ + TAILQ_HEAD(tasks, spdk_scsi_task) tasks; +}; + +struct spdk_lun_db_entry { + struct spdk_scsi_lun *lun; + struct spdk_lun_db_entry *next; +}; + +extern struct spdk_lun_db_entry *spdk_scsi_lun_list_head; + +/* This typedef exists to work around an astyle 2.05 bug. + * Remove it when astyle is fixed. 
+ */ +typedef struct spdk_scsi_lun _spdk_scsi_lun; + +_spdk_scsi_lun *spdk_scsi_lun_construct(struct spdk_bdev *bdev, + void (*hotremove_cb)(const struct spdk_scsi_lun *, void *), + void *hotremove_ctx); +void spdk_scsi_lun_destruct(struct spdk_scsi_lun *lun); + +void spdk_scsi_lun_execute_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +int spdk_scsi_lun_task_mgmt_execute(struct spdk_scsi_task *task, enum spdk_scsi_task_func func); +void spdk_scsi_lun_complete_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +void spdk_scsi_lun_complete_mgmt_task(struct spdk_scsi_lun *lun, struct spdk_scsi_task *task); +bool spdk_scsi_lun_has_pending_tasks(const struct spdk_scsi_lun *lun); +int _spdk_scsi_lun_allocate_io_channel(struct spdk_scsi_lun *lun); +void _spdk_scsi_lun_free_io_channel(struct spdk_scsi_lun *lun); + +struct spdk_scsi_dev *spdk_scsi_dev_get_list(void); + +int spdk_scsi_port_construct(struct spdk_scsi_port *port, uint64_t id, + uint16_t index, const char *name); +void spdk_scsi_port_destruct(struct spdk_scsi_port *port); + +int spdk_bdev_scsi_execute(struct spdk_scsi_task *task); +void spdk_bdev_scsi_reset(struct spdk_scsi_task *task); + +struct spdk_scsi_globals { + pthread_mutex_t mutex; +}; + +extern struct spdk_scsi_globals g_spdk_scsi; + +#endif /* SPDK_SCSI_INTERNAL_H */ diff --git a/src/spdk/lib/scsi/scsi_rpc.c b/src/spdk/lib/scsi/scsi_rpc.c new file mode 100644 index 00000000..150069a9 --- /dev/null +++ b/src/spdk/lib/scsi/scsi_rpc.c @@ -0,0 +1,82 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" + +#include "spdk/rpc.h" +#include "spdk/util.h" + +static void +spdk_rpc_get_scsi_devices(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct spdk_json_write_ctx *w; + struct spdk_scsi_dev *devs = spdk_scsi_dev_get_list(); + int i; + + if (params != NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "get_scsi_devices requires no parameters"); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_array_begin(w); + + for (i = 0; i < SPDK_SCSI_MAX_DEVS; i++) { + struct spdk_scsi_dev *dev = &devs[i]; + + if (!dev->is_allocated) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "id"); + spdk_json_write_int32(w, dev->id); + + spdk_json_write_name(w, "device_name"); + spdk_json_write_string(w, dev->name); + + spdk_json_write_object_end(w); + } + spdk_json_write_array_end(w); + + spdk_jsonrpc_end_result(request, w); +} +SPDK_RPC_REGISTER("get_scsi_devices", spdk_rpc_get_scsi_devices, SPDK_RPC_RUNTIME) diff --git a/src/spdk/lib/scsi/task.c b/src/spdk/lib/scsi/task.c new file mode 100644 index 00000000..6ddc0085 --- /dev/null +++ b/src/spdk/lib/scsi/task.c @@ -0,0 +1,256 @@ +/*- + * BSD LICENSE + * + * Copyright (C) 2008-2012 Daisuke Aoyama . + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "scsi_internal.h" +#include "spdk/endian.h" +#include "spdk/env.h" +#include "spdk/util.h" + +static void +spdk_scsi_task_free_data(struct spdk_scsi_task *task); + +void +spdk_scsi_task_put(struct spdk_scsi_task *task) +{ + if (!task) { + return; + } + + task->ref--; + + if (task->ref == 0) { + struct spdk_bdev_io *bdev_io = task->bdev_io; + + if (bdev_io) { + spdk_bdev_free_io(bdev_io); + } + + spdk_scsi_task_free_data(task); + + task->free_fn(task); + } +} + +void +spdk_scsi_task_construct(struct spdk_scsi_task *task, + spdk_scsi_task_cpl cpl_fn, + spdk_scsi_task_free free_fn) +{ + assert(task != NULL); + assert(cpl_fn != NULL); + assert(free_fn != NULL); + + task->cpl_fn = cpl_fn; + task->free_fn = free_fn; + + task->ref++; + + /* + * Pre-fill the iov_buffers to point to the embedded iov + */ + assert(task->iov.iov_base == NULL); + task->iovs = &task->iov; + task->iovcnt = 1; +} + +static void +spdk_scsi_task_free_data(struct spdk_scsi_task *task) +{ + if (task->alloc_len != 0) { + spdk_dma_free(task->iov.iov_base); + task->alloc_len = 0; + } + + task->iov.iov_base = NULL; + task->iov.iov_len = 0; +} + +static void * +spdk_scsi_task_alloc_data(struct spdk_scsi_task *task, uint32_t alloc_len) +{ + assert(task->alloc_len == 0); + + task->iov.iov_base = spdk_dma_zmalloc(alloc_len, 0, NULL); + task->iov.iov_len = alloc_len; + task->alloc_len = alloc_len; + + return task->iov.iov_base; +} + +int +spdk_scsi_task_scatter_data(struct spdk_scsi_task *task, const void *src, size_t buf_len) +{ + size_t len = 0; + size_t buf_left = buf_len; + int i; + struct iovec *iovs = task->iovs; + const uint8_t *pos; + + if (buf_len == 0) { + return 0; + } + + if (task->iovcnt == 1 && iovs[0].iov_base == NULL) { + spdk_scsi_task_alloc_data(task, buf_len); + iovs[0] = task->iov; + } + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + len += iovs[i].iov_len; + } + + if (len < buf_len) { + spdk_scsi_task_set_status(task, SPDK_SCSI_STATUS_CHECK_CONDITION, + SPDK_SCSI_SENSE_ILLEGAL_REQUEST, + SPDK_SCSI_ASC_INVALID_FIELD_IN_CDB, + SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + return -1; + } + + pos = src; + + for (i = 0; i < task->iovcnt; i++) { + len = spdk_min(iovs[i].iov_len, buf_left); + buf_left -= len; + memcpy(iovs[i].iov_base, pos, len); + pos += len; + } + + return buf_len; +} + +void * +spdk_scsi_task_gather_data(struct spdk_scsi_task *task, int *len) +{ + int i; + struct iovec *iovs = task->iovs; + size_t buf_len = 0; + uint8_t *buf, *pos; + + for (i = 0; i < task->iovcnt; i++) { + assert(iovs[i].iov_base != NULL); + buf_len += iovs[i].iov_len; + } + + if (buf_len == 0) { + *len = 0; + return NULL; + } + + buf = spdk_dma_malloc(buf_len, 0, NULL); + if (buf == NULL) { + *len = -1; + return NULL; + } + + pos = buf; + for (i = 0; i < task->iovcnt; i++) { + memcpy(pos, iovs[i].iov_base, iovs[i].iov_len); + pos += iovs[i].iov_len; + } + + *len = buf_len; + return buf; +} + +void +spdk_scsi_task_set_data(struct spdk_scsi_task *task, void *data, uint32_t len) +{ + assert(task->iovcnt == 1); + assert(task->alloc_len == 0); + + task->iovs[0].iov_base = data; + task->iovs[0].iov_len = len; +} + +void +spdk_scsi_task_build_sense_data(struct spdk_scsi_task *task, int sk, int asc, int ascq) +{ + uint8_t *cp; + int resp_code; + + resp_code = 0x70; /* Current + Fixed format */ + + /* Sense Data */ + cp = task->sense_data; + + /* VALID(7) RESPONSE CODE(6-0) */ + cp[0] = 0x80 | resp_code; + /* Obsolete */ + cp[1] = 0; + /* FILEMARK(7) EOM(6) ILI(5) SENSE KEY(3-0) */ + cp[2] = sk & 
0xf; + /* INFORMATION */ + memset(&cp[3], 0, 4); + + /* ADDITIONAL SENSE LENGTH */ + cp[7] = 10; + + /* COMMAND-SPECIFIC INFORMATION */ + memset(&cp[8], 0, 4); + /* ADDITIONAL SENSE CODE */ + cp[12] = asc; + /* ADDITIONAL SENSE CODE QUALIFIER */ + cp[13] = ascq; + /* FIELD REPLACEABLE UNIT CODE */ + cp[14] = 0; + + /* SKSV(7) SENSE KEY SPECIFIC(6-0,7-0,7-0) */ + cp[15] = 0; + cp[16] = 0; + cp[17] = 0; + + /* SenseLength */ + task->sense_data_len = 18; +} + +void +spdk_scsi_task_set_status(struct spdk_scsi_task *task, int sc, int sk, + int asc, int ascq) +{ + if (sc == SPDK_SCSI_STATUS_CHECK_CONDITION) { + spdk_scsi_task_build_sense_data(task, sk, asc, ascq); + } + task->status = sc; +} + +void +spdk_scsi_task_copy_status(struct spdk_scsi_task *dst, + struct spdk_scsi_task *src) +{ + memcpy(dst->sense_data, src->sense_data, src->sense_data_len); + dst->sense_data_len = src->sense_data_len; + dst->status = src->status; +} diff --git a/src/spdk/lib/sock/Makefile b/src/spdk/lib/sock/Makefile new file mode 100644 index 00000000..8860556d --- /dev/null +++ b/src/spdk/lib/sock/Makefile @@ -0,0 +1,44 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = sock.c net_framework.c + +LIBNAME = sock + +DIRS-y += posix +DIRS-$(CONFIG_VPP) += vpp + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/sock/net_framework.c b/src/spdk/lib/sock/net_framework.c new file mode 100644 index 00000000..5d5a568f --- /dev/null +++ b/src/spdk/lib/sock/net_framework.c @@ -0,0 +1,70 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/log.h" +#include "spdk/net.h" +#include "spdk/queue.h" + +static STAILQ_HEAD(, spdk_net_framework) g_net_frameworks = + STAILQ_HEAD_INITIALIZER(g_net_frameworks); + +int spdk_net_framework_start(void) +{ + struct spdk_net_framework *net_framework = NULL; + int rc; + + STAILQ_FOREACH_FROM(net_framework, &g_net_frameworks, link) { + rc = net_framework->init(); + if (rc != 0) { + SPDK_ERRLOG("Net framework %s failed to initialize\n", net_framework->name); + return rc; + } + } + + return 0; +} + +void spdk_net_framework_fini(void) +{ + struct spdk_net_framework *net_framework = NULL; + + STAILQ_FOREACH_FROM(net_framework, &g_net_frameworks, link) { + net_framework->fini(); + } +} + +void +spdk_net_framework_register(struct spdk_net_framework *frame) +{ + STAILQ_INSERT_TAIL(&g_net_frameworks, frame, link); +} diff --git a/src/spdk/lib/sock/posix/Makefile b/src/spdk/lib/sock/posix/Makefile new file mode 100644 index 00000000..540694c4 --- /dev/null +++ b/src/spdk/lib/sock/posix/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +LIBNAME = sock_posix +C_SRCS = posix.c + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/sock/posix/posix.c b/src/spdk/lib/sock/posix/posix.c new file mode 100644 index 00000000..565d3892 --- /dev/null +++ b/src/spdk/lib/sock/posix/posix.c @@ -0,0 +1,604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif + +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk_internal/sock.h" + +#define MAX_TMPBUF 1024 +#define PORTNUMLEN 32 + +struct spdk_posix_sock { + struct spdk_sock base; + int fd; +}; + +struct spdk_posix_sock_group_impl { + struct spdk_sock_group_impl base; + int fd; +}; + +static int +get_addr_str(struct sockaddr *sa, char *host, size_t hlen) +{ + const char *result = NULL; + + if (sa == NULL || host == NULL) { + return -1; + } + + switch (sa->sa_family) { + case AF_INET: + result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr), + host, hlen); + break; + case AF_INET6: + result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr), + host, hlen); + break; + default: + break; + } + + if (result != NULL) { + return 0; + } else { + return -1; + } +} + +#define __posix_sock(sock) (struct spdk_posix_sock *)sock +#define __posix_group_impl(group) (struct spdk_posix_sock_group_impl *)group + +static int +spdk_posix_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, + char *caddr, int clen, uint16_t *cport) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + struct sockaddr_storage sa; + socklen_t salen; + int rc; + + assert(sock != NULL); + + memset(&sa, 0, sizeof sa); + salen = sizeof sa; + rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); + if (rc != 0) { + SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); + return -1; + } + + switch (sa.ss_family) { + case AF_UNIX: + /* Acceptable connection types that don't have IPs */ + return 0; + case AF_INET: + case AF_INET6: + /* Code below will get IP addresses */ + break; + default: + /* Unsupported socket family */ + return -1; + } + + rc = get_addr_str((struct sockaddr *)&sa, saddr, slen); + if (rc != 0) { + SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); + return -1; + } + + if (sport) { + if (sa.ss_family == AF_INET) { + *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); + } else if (sa.ss_family == AF_INET6) { + *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); + } + } + + memset(&sa, 0, sizeof sa); + salen = sizeof sa; + rc = getpeername(sock->fd, (struct sockaddr *) &sa, &salen); + if (rc != 0) { + SPDK_ERRLOG("getpeername() failed (errno=%d)\n", errno); + return -1; + } + + rc = get_addr_str((struct sockaddr *)&sa, caddr, clen); + if (rc != 0) { + SPDK_ERRLOG("getnameinfo() failed (errno=%d)\n", errno); + return -1; + } + + if (cport) { + if (sa.ss_family == AF_INET) { + *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); + } else if (sa.ss_family == AF_INET6) { + *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); + } + } + + return 0; +} + +enum spdk_posix_sock_create_type { + SPDK_SOCK_CREATE_LISTEN, + SPDK_SOCK_CREATE_CONNECT, +}; + +static struct spdk_sock * +spdk_posix_sock_create(const char *ip, int port, enum spdk_posix_sock_create_type type) +{ + struct spdk_posix_sock *sock; + char buf[MAX_TMPBUF]; + char portnum[PORTNUMLEN]; + char *p; + struct addrinfo hints, *res, *res0; + int fd, flag; + int val = 1; + int rc; + + if (ip == NULL) { + return NULL; + } + if (ip[0] == '[') { + snprintf(buf, sizeof(buf), "%s", ip + 1); + p = strchr(buf, ']'); + if (p != NULL) { + *p = '\0'; + } + ip = (const char *) &buf[0]; + } + + snprintf(portnum, sizeof portnum, "%d", port); + memset(&hints, 0, sizeof hints); + hints.ai_family = PF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICSERV; + 
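/* AI_PASSIVE makes the result usable for bind(); AI_NUMERICHOST skips DNS resolution, since ip is expected to be a numeric address here. */ +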
hints.ai_flags |= AI_PASSIVE; + hints.ai_flags |= AI_NUMERICHOST; + rc = getaddrinfo(ip, portnum, &hints, &res0); + if (rc != 0) { + SPDK_ERRLOG("getaddrinfo() failed (errno=%d)\n", errno); + return NULL; + } + + /* try listen */ + fd = -1; + for (res = res0; res != NULL; res = res->ai_next) { +retry: + fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); + if (fd < 0) { + /* error */ + continue; + } + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof val); + if (rc != 0) { + close(fd); + /* error */ + continue; + } + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &val, sizeof val); + if (rc != 0) { + close(fd); + /* error */ + continue; + } + + if (res->ai_family == AF_INET6) { + rc = setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &val, sizeof val); + if (rc != 0) { + close(fd); + /* error */ + continue; + } + } + + if (type == SPDK_SOCK_CREATE_LISTEN) { + rc = bind(fd, res->ai_addr, res->ai_addrlen); + if (rc != 0) { + SPDK_ERRLOG("bind() failed, errno = %d\n", errno); + switch (errno) { + case EINTR: + /* interrupted? */ + close(fd); + goto retry; + case EADDRNOTAVAIL: + SPDK_ERRLOG("IP address %s not available. " + "Verify IP address in config file " + "and make sure setup script is " + "run before starting spdk app.\n", ip); + /* FALLTHROUGH */ + default: + /* try next family */ + close(fd); + fd = -1; + continue; + } + } + /* bind OK */ + rc = listen(fd, 512); + if (rc != 0) { + SPDK_ERRLOG("listen() failed, errno = %d\n", errno); + close(fd); + fd = -1; + break; + } + } else if (type == SPDK_SOCK_CREATE_CONNECT) { + rc = connect(fd, res->ai_addr, res->ai_addrlen); + if (rc != 0) { + SPDK_ERRLOG("connect() failed, errno = %d\n", errno); + /* try next family */ + close(fd); + fd = -1; + continue; + } + } + + flag = fcntl(fd, F_GETFL); + if (fcntl(fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", fd, errno); + close(fd); + fd = -1; + break; + } + break; + } + freeaddrinfo(res0); + + if (fd < 0) { + return NULL; + } + + sock = calloc(1, sizeof(*sock)); + if (sock == NULL) { + SPDK_ERRLOG("sock allocation failed\n"); + close(fd); + return NULL; + } + + sock->fd = fd; + return &sock->base; +} + +static struct spdk_sock * +spdk_posix_sock_listen(const char *ip, int port) +{ + return spdk_posix_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN); +} + +static struct spdk_sock * +spdk_posix_sock_connect(const char *ip, int port) +{ + return spdk_posix_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT); +} + +static struct spdk_sock * +spdk_posix_sock_accept(struct spdk_sock *_sock) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + struct sockaddr_storage sa; + socklen_t salen; + int rc; + struct spdk_posix_sock *new_sock; + int flag; + + memset(&sa, 0, sizeof(sa)); + salen = sizeof(sa); + + assert(sock != NULL); + + rc = accept(sock->fd, (struct sockaddr *)&sa, &salen); + + if (rc == -1) { + return NULL; + } + + flag = fcntl(rc, F_GETFL); + if ((!(flag & O_NONBLOCK)) && (fcntl(rc, F_SETFL, flag | O_NONBLOCK) < 0)) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%d)\n", rc, errno); + close(rc); + return NULL; + } + + new_sock = calloc(1, sizeof(*sock)); + if (new_sock == NULL) { + SPDK_ERRLOG("sock allocation failed\n"); + close(rc); + return NULL; + } + + new_sock->fd = rc; + return &new_sock->base; +} + +static int +spdk_posix_sock_close(struct spdk_sock *_sock) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + int rc; + + rc = close(sock->fd); + if (rc == 0) { + free(sock); + } + 
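+ /* If close() failed, the sock object is left allocated and rc reports the failure to the caller. */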
+ return rc; +} + +static ssize_t +spdk_posix_sock_recv(struct spdk_sock *_sock, void *buf, size_t len) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + + return recv(sock->fd, buf, len, MSG_DONTWAIT); +} + +static ssize_t +spdk_posix_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + + return writev(sock->fd, iov, iovcnt); +} + +static int +spdk_posix_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + int val; + int rc; + + assert(sock != NULL); + + val = nbytes; + rc = setsockopt(sock->fd, SOL_SOCKET, SO_RCVLOWAT, &val, sizeof val); + if (rc != 0) { + return -1; + } + return 0; +} + +static int +spdk_posix_sock_set_recvbuf(struct spdk_sock *_sock, int sz) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + + assert(sock != NULL); + + return setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, + &sz, sizeof(sz)); +} + +static int +spdk_posix_sock_set_sendbuf(struct spdk_sock *_sock, int sz) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + + assert(sock != NULL); + + return setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, + &sz, sizeof(sz)); +} + +static bool +spdk_posix_sock_is_ipv6(struct spdk_sock *_sock) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + struct sockaddr_storage sa; + socklen_t salen; + int rc; + + assert(sock != NULL); + + memset(&sa, 0, sizeof sa); + salen = sizeof sa; + rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); + if (rc != 0) { + SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); + return false; + } + + return (sa.ss_family == AF_INET6); +} + +static bool +spdk_posix_sock_is_ipv4(struct spdk_sock *_sock) +{ + struct spdk_posix_sock *sock = __posix_sock(_sock); + struct sockaddr_storage sa; + socklen_t salen; + int rc; + + assert(sock != NULL); + + memset(&sa, 0, sizeof sa); + salen = sizeof sa; + rc = getsockname(sock->fd, (struct sockaddr *) &sa, &salen); + if (rc != 0) { + SPDK_ERRLOG("getsockname() failed (errno=%d)\n", errno); + return false; + } + + return (sa.ss_family == AF_INET); +} + +static struct spdk_sock_group_impl * +spdk_posix_sock_group_impl_create(void) +{ + struct spdk_posix_sock_group_impl *group_impl; + int fd; + +#if defined(__linux__) + fd = epoll_create1(0); +#elif defined(__FreeBSD__) + fd = kqueue(); +#endif + if (fd == -1) { + return NULL; + } + + group_impl = calloc(1, sizeof(*group_impl)); + if (group_impl == NULL) { + SPDK_ERRLOG("group_impl allocation failed\n"); + close(fd); + return NULL; + } + + group_impl->fd = fd; + + return &group_impl->base; +} + +static int +spdk_posix_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +{ + struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); + struct spdk_posix_sock *sock = __posix_sock(_sock); + int rc; + +#if defined(__linux__) + struct epoll_event event; + + event.events = EPOLLIN; + event.data.ptr = sock; + + rc = epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); +#elif defined(__FreeBSD__) + struct kevent event; + struct timespec ts = {0}; + + EV_SET(&event, sock->fd, EVFILT_READ, EV_ADD, 0, 0, sock); + + rc = kevent(group->fd, &event, 1, NULL, 0, &ts); +#endif + return rc; +} + +static int +spdk_posix_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +{ + struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); + struct spdk_posix_sock *sock = __posix_sock(_sock); + int rc; +#if 
defined(__linux__) + struct epoll_event event; + + /* Event parameter is ignored but some old kernel version still require it. */ + rc = epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); +#elif defined(__FreeBSD__) + struct kevent event; + struct timespec ts = {0}; + + EV_SET(&event, sock->fd, EVFILT_READ, EV_DELETE, 0, 0, NULL); + + rc = kevent(group->fd, &event, 1, NULL, 0, &ts); + if (rc == 0 && event.flags & EV_ERROR) { + rc = -1; + errno = event.data; + } +#endif + return rc; +} + +static int +spdk_posix_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, + struct spdk_sock **socks) +{ + struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); + int num_events, i; + +#if defined(__linux__) + struct epoll_event events[MAX_EVENTS_PER_POLL]; + + num_events = epoll_wait(group->fd, events, max_events, 0); +#elif defined(__FreeBSD__) + struct kevent events[MAX_EVENTS_PER_POLL]; + struct timespec ts = {0}; + + num_events = kevent(group->fd, NULL, 0, events, max_events, &ts); +#endif + + if (num_events == -1) { + return -1; + } + + for (i = 0; i < num_events; i++) { +#if defined(__linux__) + socks[i] = events[i].data.ptr; +#elif defined(__FreeBSD__) + socks[i] = events[i].udata; +#endif + } + + return num_events; +} + +static int +spdk_posix_sock_group_impl_close(struct spdk_sock_group_impl *_group) +{ + struct spdk_posix_sock_group_impl *group = __posix_group_impl(_group); + + return close(group->fd); +} + +static struct spdk_net_impl g_posix_net_impl = { + .name = "posix", + .getaddr = spdk_posix_sock_getaddr, + .connect = spdk_posix_sock_connect, + .listen = spdk_posix_sock_listen, + .accept = spdk_posix_sock_accept, + .close = spdk_posix_sock_close, + .recv = spdk_posix_sock_recv, + .writev = spdk_posix_sock_writev, + .set_recvlowat = spdk_posix_sock_set_recvlowat, + .set_recvbuf = spdk_posix_sock_set_recvbuf, + .set_sendbuf = spdk_posix_sock_set_sendbuf, + .is_ipv6 = spdk_posix_sock_is_ipv6, + .is_ipv4 = spdk_posix_sock_is_ipv4, + .group_impl_create = spdk_posix_sock_group_impl_create, + .group_impl_add_sock = spdk_posix_sock_group_impl_add_sock, + .group_impl_remove_sock = spdk_posix_sock_group_impl_remove_sock, + .group_impl_poll = spdk_posix_sock_group_impl_poll, + .group_impl_close = spdk_posix_sock_group_impl_close, +}; + +SPDK_NET_IMPL_REGISTER(posix, &g_posix_net_impl); diff --git a/src/spdk/lib/sock/sock.c b/src/spdk/lib/sock/sock.c new file mode 100644 index 00000000..d31aa9b0 --- /dev/null +++ b/src/spdk/lib/sock/sock.c @@ -0,0 +1,373 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk_internal/sock.h" +#include "spdk/queue.h" + +static STAILQ_HEAD(, spdk_net_impl) g_net_impls = STAILQ_HEAD_INITIALIZER(g_net_impls); + +int +spdk_sock_getaddr(struct spdk_sock *sock, char *saddr, int slen, uint16_t *sport, + char *caddr, int clen, uint16_t *cport) +{ + return sock->net_impl->getaddr(sock, saddr, slen, sport, caddr, clen, cport); +} + +struct spdk_sock * +spdk_sock_connect(const char *ip, int port) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + sock = impl->connect(ip, port); + if (sock != NULL) { + sock->net_impl = impl; + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_listen(const char *ip, int port) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock *sock; + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + sock = impl->listen(ip, port); + if (sock != NULL) { + sock->net_impl = impl; + return sock; + } + } + + return NULL; +} + +struct spdk_sock * +spdk_sock_accept(struct spdk_sock *sock) +{ + struct spdk_sock *new_sock; + + new_sock = sock->net_impl->accept(sock); + if (new_sock != NULL) { + new_sock->net_impl = sock->net_impl; + } + + return new_sock; +} + +int +spdk_sock_close(struct spdk_sock **sock) +{ + int rc; + + if (*sock == NULL) { + errno = EBADF; + return -1; + } + + if ((*sock)->cb_fn != NULL) { + /* This sock is still part of a sock_group. 
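The socket must first be removed from its group with spdk_sock_group_remove_sock().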
*/ + errno = EBUSY; + return -1; + } + + rc = (*sock)->net_impl->close(*sock); + if (rc == 0) { + *sock = NULL; + } + + return rc; +} + +ssize_t +spdk_sock_recv(struct spdk_sock *sock, void *buf, size_t len) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + return sock->net_impl->recv(sock, buf, len); +} + +ssize_t +spdk_sock_writev(struct spdk_sock *sock, struct iovec *iov, int iovcnt) +{ + if (sock == NULL) { + errno = EBADF; + return -1; + } + + return sock->net_impl->writev(sock, iov, iovcnt); +} + + +int +spdk_sock_set_recvlowat(struct spdk_sock *sock, int nbytes) +{ + return sock->net_impl->set_recvlowat(sock, nbytes); +} + +int +spdk_sock_set_recvbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_recvbuf(sock, sz); +} + +int +spdk_sock_set_sendbuf(struct spdk_sock *sock, int sz) +{ + return sock->net_impl->set_sendbuf(sock, sz); +} + +bool +spdk_sock_is_ipv6(struct spdk_sock *sock) +{ + return sock->net_impl->is_ipv6(sock); +} + +bool +spdk_sock_is_ipv4(struct spdk_sock *sock) +{ + return sock->net_impl->is_ipv4(sock); +} + +struct spdk_sock_group * +spdk_sock_group_create(void) +{ + struct spdk_net_impl *impl = NULL; + struct spdk_sock_group *group; + struct spdk_sock_group_impl *group_impl; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + STAILQ_INIT(&group->group_impls); + + STAILQ_FOREACH_FROM(impl, &g_net_impls, link) { + group_impl = impl->group_impl_create(); + if (group_impl != NULL) { + STAILQ_INSERT_TAIL(&group->group_impls, group_impl, link); + TAILQ_INIT(&group_impl->socks); + group_impl->net_impl = impl; + } + } + + return group; +} + +int +spdk_sock_group_add_sock(struct spdk_sock_group *group, struct spdk_sock *sock, + spdk_sock_cb cb_fn, void *cb_arg) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc; + + if (cb_fn == NULL) { + errno = EINVAL; + return -1; + } + + if (sock->cb_fn != NULL) { + /* + * This sock is already part of a sock_group. Currently we don't + * support this. 
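+ * A socket can belong to at most one sock_group at a time.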
+ */ + errno = EBUSY; + return -1; + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + rc = group_impl->net_impl->group_impl_add_sock(group_impl, sock); + if (rc == 0) { + TAILQ_INSERT_TAIL(&group_impl->socks, sock, link); + sock->cb_fn = cb_fn; + sock->cb_arg = cb_arg; + } + + return rc; +} + +int +spdk_sock_group_remove_sock(struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc; + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + if (sock->net_impl == group_impl->net_impl) { + break; + } + } + + if (group_impl == NULL) { + errno = EINVAL; + return -1; + } + + rc = group_impl->net_impl->group_impl_remove_sock(group_impl, sock); + if (rc == 0) { + TAILQ_REMOVE(&group_impl->socks, sock, link); + sock->cb_fn = NULL; + sock->cb_arg = NULL; + } + + return rc; +} + +int +spdk_sock_group_poll(struct spdk_sock_group *group) +{ + return spdk_sock_group_poll_count(group, MAX_EVENTS_PER_POLL); +} + +static int +spdk_sock_group_impl_poll_count(struct spdk_sock_group_impl *group_impl, + struct spdk_sock_group *group, + int max_events) +{ + struct spdk_sock *socks[MAX_EVENTS_PER_POLL]; + int num_events, i; + + if (TAILQ_EMPTY(&group_impl->socks)) { + return 0; + } + + num_events = group_impl->net_impl->group_impl_poll(group_impl, max_events, socks); + if (num_events == -1) { + return -1; + } + + for (i = 0; i < num_events; i++) { + struct spdk_sock *sock = socks[i]; + + assert(sock->cb_fn != NULL); + sock->cb_fn(sock->cb_arg, group, sock); + } + return 0; +} + +int +spdk_sock_group_poll_count(struct spdk_sock_group *group, int max_events) +{ + struct spdk_sock_group_impl *group_impl = NULL; + int rc, final_rc = 0; + + if (max_events < 1) { + errno = -EINVAL; + return -1; + } + + /* + * Only poll for up to 32 events at a time - if more events are pending, + * the next call to this function will reap them. + */ + if (max_events > MAX_EVENTS_PER_POLL) { + max_events = MAX_EVENTS_PER_POLL; + } + + STAILQ_FOREACH_FROM(group_impl, &group->group_impls, link) { + rc = spdk_sock_group_impl_poll_count(group_impl, group, max_events); + if (rc != 0) { + final_rc = rc; + SPDK_ERRLOG("group_impl_poll_count for net(%s) failed\n", + group_impl->net_impl->name); + } + } + + return final_rc; +} + +int +spdk_sock_group_close(struct spdk_sock_group **group) +{ + struct spdk_sock_group_impl *group_impl = NULL, *tmp; + int rc; + + if (*group == NULL) { + errno = EBADF; + return -1; + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + if (!TAILQ_EMPTY(&group_impl->socks)) { + errno = EBUSY; + return -1; + } + } + + STAILQ_FOREACH_SAFE(group_impl, &(*group)->group_impls, link, tmp) { + rc = group_impl->net_impl->group_impl_close(group_impl); + if (rc != 0) { + SPDK_ERRLOG("group_impl_close for net(%s) failed\n", + group_impl->net_impl->name); + } + free(group_impl); + } + + free(*group); + *group = NULL; + + return 0; +} + +void +spdk_net_impl_register(struct spdk_net_impl *impl) +{ + if (!strcmp("posix", impl->name)) { + STAILQ_INSERT_TAIL(&g_net_impls, impl, link); + } else { + STAILQ_INSERT_HEAD(&g_net_impls, impl, link); + } +} diff --git a/src/spdk/lib/sock/vpp/Makefile b/src/spdk/lib/sock/vpp/Makefile new file mode 100644 index 00000000..614fd2e3 --- /dev/null +++ b/src/spdk/lib/sock/vpp/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. 
+# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS += vpp.c + +LIBNAME = sock_vpp + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/sock/vpp/vpp.c b/src/spdk/lib/sock/vpp/vpp.c new file mode 100644 index 00000000..752250eb --- /dev/null +++ b/src/spdk/lib/sock/vpp/vpp.c @@ -0,0 +1,663 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/log.h" +#include "spdk/sock.h" +#include "spdk/net.h" +#include "spdk/string.h" +#include "spdk_internal/sock.h" +#include + +#define MAX_TMPBUF 1024 +#define PORTNUMLEN 32 + +static bool g_vpp_initialized = false; + +struct spdk_vpp_sock { + struct spdk_sock base; + int fd; +}; + +struct spdk_vpp_sock_group_impl { + struct spdk_sock_group_impl base; + int fd; +}; + +static int +get_addr_str(struct sockaddr *sa, char *host, size_t hlen) +{ + const char *result = NULL; + + if (sa == NULL || host == NULL) { + return -1; + } + + if (sa->sa_family == AF_INET) { + result = inet_ntop(AF_INET, &(((struct sockaddr_in *)sa)->sin_addr), + host, hlen); + } else { + result = inet_ntop(AF_INET6, &(((struct sockaddr_in6 *)sa)->sin6_addr), + host, hlen); + } + + if (result == NULL) { + return -1; + } + + return 0; +} + +#define __vpp_sock(sock) (struct spdk_vpp_sock *)sock +#define __vpp_group_impl(group) (struct spdk_vpp_sock_group_impl *)group + +static inline void +vcom_socket_copy_ep_to_sockaddr(struct sockaddr *addr, socklen_t *len, vppcom_endpt_t *ep) +{ + int sa_len, copy_len; + + assert(ep->vrf == VPPCOM_VRF_DEFAULT); + + if (ep->is_ip4 == VPPCOM_IS_IP4) { + addr->sa_family = AF_INET; + ((struct sockaddr_in *) addr)->sin_port = ep->port; + if (*len > sizeof(struct sockaddr_in)) { + *len = sizeof(struct sockaddr_in); + } + sa_len = sizeof(struct sockaddr_in) - sizeof(struct in_addr); + copy_len = *len - sa_len; + if (copy_len > 0) { + memcpy(&((struct sockaddr_in *) addr)->sin_addr, ep->ip, copy_len); + } + } else { + addr->sa_family = AF_INET6; + ((struct sockaddr_in6 *) addr)->sin6_port = ep->port; + if (*len > sizeof(struct sockaddr_in6)) { + *len = sizeof(struct sockaddr_in6); + } + sa_len = sizeof(struct sockaddr_in6) - sizeof(struct in6_addr); + copy_len = *len - sa_len; + if (copy_len > 0) { + memcpy(&((struct sockaddr_in6 *) addr)->sin6_addr, ep->ip, copy_len); + } + } +} + +static int +getsockname_vpp(int fd, struct sockaddr *addr, socklen_t *len) +{ + vppcom_endpt_t ep; + uint32_t size = sizeof(ep); + uint8_t addr_buf[sizeof(struct in6_addr)]; + int rc; + + if (!addr || !len) { + return -EFAULT; + } + + ep.ip = addr_buf; + + rc = vppcom_session_attr(fd, VPPCOM_ATTR_GET_LCL_ADDR, &ep, &size); + if (rc == VPPCOM_OK) { + vcom_socket_copy_ep_to_sockaddr(addr, len, &ep); + } + + return rc; +} + + +static int +getpeername_vpp(int sock, struct sockaddr *addr, socklen_t *len) +{ + vppcom_endpt_t ep; + uint32_t size = sizeof(ep); + uint8_t addr_buf[sizeof(struct in6_addr)]; + int rc; + + if (!addr || !len) { + return -EFAULT; + } + + ep.ip = addr_buf; + + rc = vppcom_session_attr(sock, VPPCOM_ATTR_GET_PEER_ADDR, &ep, &size); + if (rc == VPPCOM_OK) { + vcom_socket_copy_ep_to_sockaddr(addr, len, &ep); + } + + return rc; +} + +static int +spdk_vpp_sock_getaddr(struct spdk_sock *_sock, char *saddr, int slen, uint16_t *sport, + char *caddr, int clen, uint16_t *cport) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + struct sockaddr sa; + socklen_t salen; + int rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + memset(&sa, 0, sizeof(sa)); + salen = sizeof(sa); + rc = getsockname_vpp(sock->fd, &sa, &salen); + if (rc != 0) { + errno = -rc; + SPDK_ERRLOG("getsockname_vpp() failed (errno=%d)\n", errno); + return -1; + } + + rc = get_addr_str(&sa, saddr, slen); + if (rc != 0) { + /* Errno already set by get_addr_str() */ + SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno); + return -1; + } + + if (sport) { + if (sa.ss_family == 
AF_INET) { + *sport = ntohs(((struct sockaddr_in *) &sa)->sin_port); + } else if (sa.ss_family == AF_INET6) { + *sport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); + } + } + + memset(&sa, 0, sizeof(sa)); + salen = sizeof(sa); + rc = getpeername_vpp(sock->fd, &sa, &salen); + if (rc != 0) { + errno = -rc; + SPDK_ERRLOG("getpeername_vpp() failed (errno=%d)\n", errno); + return -1; + } + + rc = get_addr_str(&sa, caddr, clen); + if (rc != 0) { + /* Errno already set by get_addr_str() */ + SPDK_ERRLOG("get_addr_str() failed (errno=%d)\n", errno); + return -1; + } + + if (cport) { + if (sa.ss_family == AF_INET) { + *cport = ntohs(((struct sockaddr_in *) &sa)->sin_port); + } else if (sa.ss_family == AF_INET6) { + *cport = ntohs(((struct sockaddr_in6 *) &sa)->sin6_port); + } + } + + return 0; +} + +enum spdk_vpp_create_type { + SPDK_SOCK_CREATE_LISTEN, + SPDK_SOCK_CREATE_CONNECT, +}; + +static struct spdk_sock * +spdk_vpp_sock_create(const char *ip, int port, enum spdk_vpp_create_type type) +{ + struct spdk_vpp_sock *sock; + int fd, rc; + vppcom_endpt_t endpt; + uint8_t addr_buf[sizeof(struct in6_addr)]; + + if (ip == NULL) { + return NULL; + } + + /* Check address family */ + if (inet_pton(AF_INET, ip, &addr_buf)) { + endpt.is_ip4 = VPPCOM_IS_IP4; + } else if (inet_pton(AF_INET6, ip, &addr_buf)) { + endpt.is_ip4 = VPPCOM_IS_IP6; + } else { + SPDK_ERRLOG("IP address with invalid format\n"); + return NULL; + } + endpt.vrf = VPPCOM_VRF_DEFAULT; + endpt.ip = (uint8_t *)&addr_buf; + endpt.port = htons(port); + + fd = vppcom_session_create(VPPCOM_VRF_DEFAULT, VPPCOM_PROTO_TCP, 1 /* is_nonblocking */); + if (fd < 0) { + errno = -fd; + SPDK_ERRLOG("vppcom_session_create() failed, errno = %d\n", errno); + return NULL; + } + + if (type == SPDK_SOCK_CREATE_LISTEN) { + rc = vppcom_session_bind(fd, &endpt); + if (rc != VPPCOM_OK) { + errno = -rc; + SPDK_ERRLOG("vppcom_session_bind() failed, errno = %d\n", errno); + vppcom_session_close(fd); + return NULL; + } + + rc = vppcom_session_listen(fd, 512); + if (rc != VPPCOM_OK) { + errno = -rc; + SPDK_ERRLOG("vppcom_session_listen() failed, errno = %d\n", errno); + vppcom_session_close(fd); + return NULL; + } + } else if (type == SPDK_SOCK_CREATE_CONNECT) { + rc = vppcom_session_connect(fd, &endpt); + if (rc != VPPCOM_OK) { + errno = -rc; + SPDK_ERRLOG("vppcom_session_connect() failed, errno = %d\n", errno); + vppcom_session_close(fd); + return NULL; + } + } + + sock = calloc(1, sizeof(*sock)); + if (sock == NULL) { + errno = -ENOMEM; + SPDK_ERRLOG("sock allocation failed\n"); + vppcom_session_close(fd); + return NULL; + } + + sock->fd = fd; + return &sock->base; +} + +static struct spdk_sock * +spdk_vpp_sock_listen(const char *ip, int port) +{ + if (!g_vpp_initialized) { + return NULL; + } + + return spdk_vpp_sock_create(ip, port, SPDK_SOCK_CREATE_LISTEN); +} + +static struct spdk_sock * +spdk_vpp_sock_connect(const char *ip, int port) +{ + if (!g_vpp_initialized) { + return NULL; + } + + return spdk_vpp_sock_create(ip, port, SPDK_SOCK_CREATE_CONNECT); +} + +static struct spdk_sock * +spdk_vpp_sock_accept(struct spdk_sock *_sock) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + vppcom_endpt_t endpt; + uint8_t ip[16]; + int rc; + struct spdk_vpp_sock *new_sock; + double wait_time = -1.0; + + endpt.ip = ip; + + assert(sock != NULL); + assert(g_vpp_initialized); + + rc = vppcom_session_accept(sock->fd, &endpt, O_NONBLOCK, wait_time); + if (rc < 0) { + errno = -rc; + return NULL; + } + + new_sock = calloc(1, sizeof(*sock)); + if (new_sock == NULL) { + 
SPDK_ERRLOG("sock allocation failed\n"); + vppcom_session_close(rc); + return NULL; + } + + new_sock->fd = rc; + return &new_sock->base; +} + +static int +spdk_vpp_sock_close(struct spdk_sock *_sock) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + int rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + rc = vppcom_session_close(sock->fd); + if (rc != VPPCOM_OK) { + errno = -rc; + return -1; + } + free(sock); + + return 0; +} + +static ssize_t +spdk_vpp_sock_recv(struct spdk_sock *_sock, void *buf, size_t len) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + int rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + rc = vppcom_session_read(sock->fd, buf, len); + if (rc < 0) { + errno = -rc; + return -1; + } + return rc; +} + +static ssize_t +spdk_vpp_sock_writev(struct spdk_sock *_sock, struct iovec *iov, int iovcnt) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + ssize_t total = 0; + int i, rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + for (i = 0; i < iovcnt; ++i) { + rc = vppcom_session_write(sock->fd, iov[i].iov_base, iov[i].iov_len); + if (rc < 0) { + if (total > 0) { + break; + } else { + errno = -rc; + return -1; + } + } else { + total += rc; + } + } + return total; +} + + +/* + * TODO: Check if there are similar parameters to configure in VPP + * to three below. + */ +static int +spdk_vpp_sock_set_recvlowat(struct spdk_sock *_sock, int nbytes) +{ + assert(g_vpp_initialized); + + return 0; +} + +static int +spdk_vpp_sock_set_recvbuf(struct spdk_sock *_sock, int sz) +{ + assert(g_vpp_initialized); + + return 0; +} + +static int +spdk_vpp_sock_set_sendbuf(struct spdk_sock *_sock, int sz) +{ + assert(g_vpp_initialized); + + return 0; +} + +static bool +spdk_vpp_sock_is_ipv6(struct spdk_sock *_sock) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + vppcom_endpt_t ep; + uint32_t size = sizeof(ep); + uint8_t addr_buf[sizeof(struct in6_addr)]; + int rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + ep.ip = addr_buf; + + rc = vppcom_session_attr(sock->fd, VPPCOM_ATTR_GET_LCL_ADDR, &ep, &size); + if (rc != VPPCOM_OK) { + errno = -rc; + return false; + } + + return (ep.is_ip4 == VPPCOM_IS_IP6); +} + +static bool +spdk_vpp_sock_is_ipv4(struct spdk_sock *_sock) +{ + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + vppcom_endpt_t ep; + uint32_t size = sizeof(ep); + uint8_t addr_buf[sizeof(struct in6_addr)]; + int rc; + + assert(sock != NULL); + assert(g_vpp_initialized); + + ep.ip = addr_buf; + + rc = vppcom_session_attr(sock->fd, VPPCOM_ATTR_GET_LCL_ADDR, &ep, &size); + if (rc != VPPCOM_OK) { + errno = -rc; + return false; + } + + return (ep.is_ip4 == VPPCOM_IS_IP4); +} + +static struct spdk_sock_group_impl * +spdk_vpp_sock_group_impl_create(void) +{ + struct spdk_vpp_sock_group_impl *group_impl; + int fd; + + if (!g_vpp_initialized) { + return NULL; + } + + group_impl = calloc(1, sizeof(*group_impl)); + if (group_impl == NULL) { + SPDK_ERRLOG("sock_group allocation failed\n"); + return NULL; + } + + fd = vppcom_epoll_create(); + if (fd < 0) { + free(group_impl); + return NULL; + } + + group_impl->fd = fd; + + return &group_impl->base; +} + +static int +spdk_vpp_sock_group_impl_add_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +{ + struct spdk_vpp_sock_group_impl *group = __vpp_group_impl(_group); + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + int rc; + struct epoll_event event; + + assert(sock != NULL); + assert(group != NULL); + assert(g_vpp_initialized); + + event.events = EPOLLIN; + 
event.data.ptr = sock; + + rc = vppcom_epoll_ctl(group->fd, EPOLL_CTL_ADD, sock->fd, &event); + if (rc != VPPCOM_OK) { + errno = -rc; + return -1; + } + + return 0; +} + +static int +spdk_vpp_sock_group_impl_remove_sock(struct spdk_sock_group_impl *_group, struct spdk_sock *_sock) +{ + struct spdk_vpp_sock_group_impl *group = __vpp_group_impl(_group); + struct spdk_vpp_sock *sock = __vpp_sock(_sock); + int rc; + struct epoll_event event; + + assert(sock != NULL); + assert(group != NULL); + assert(g_vpp_initialized); + + rc = vppcom_epoll_ctl(group->fd, EPOLL_CTL_DEL, sock->fd, &event); + if (rc != VPPCOM_OK) { + errno = -rc; + return -1; + } + + return 0; +} + +static int +spdk_vpp_sock_group_impl_poll(struct spdk_sock_group_impl *_group, int max_events, + struct spdk_sock **socks) +{ + struct spdk_vpp_sock_group_impl *group = __vpp_group_impl(_group); + int num_events, i; + struct epoll_event events[MAX_EVENTS_PER_POLL]; + + assert(group != NULL); + assert(socks != NULL); + assert(g_vpp_initialized); + + num_events = vppcom_epoll_wait(group->fd, events, max_events, 0); + if (num_events < 0) { + errno = -num_events; + return -1; + } + + for (i = 0; i < num_events; i++) { + socks[i] = events[i].data.ptr; + } + + return num_events; +} + +static int +spdk_vpp_sock_group_impl_close(struct spdk_sock_group_impl *_group) +{ + struct spdk_vpp_sock_group_impl *group = __vpp_group_impl(_group); + int rc; + + assert(group != NULL); + assert(g_vpp_initialized); + + rc = vppcom_session_close(group->fd); + if (rc != VPPCOM_OK) { + errno = -rc; + return -1; + } + + return 0; +} + +static struct spdk_net_impl g_vpp_net_impl = { + .name = "vpp", + .getaddr = spdk_vpp_sock_getaddr, + .connect = spdk_vpp_sock_connect, + .listen = spdk_vpp_sock_listen, + .accept = spdk_vpp_sock_accept, + .close = spdk_vpp_sock_close, + .recv = spdk_vpp_sock_recv, + .writev = spdk_vpp_sock_writev, + .set_recvlowat = spdk_vpp_sock_set_recvlowat, + .set_recvbuf = spdk_vpp_sock_set_recvbuf, + .set_sendbuf = spdk_vpp_sock_set_sendbuf, + .is_ipv6 = spdk_vpp_sock_is_ipv6, + .is_ipv4 = spdk_vpp_sock_is_ipv4, + .group_impl_create = spdk_vpp_sock_group_impl_create, + .group_impl_add_sock = spdk_vpp_sock_group_impl_add_sock, + .group_impl_remove_sock = spdk_vpp_sock_group_impl_remove_sock, + .group_impl_poll = spdk_vpp_sock_group_impl_poll, + .group_impl_close = spdk_vpp_sock_group_impl_close, +}; + +SPDK_NET_IMPL_REGISTER(vpp, &g_vpp_net_impl); + +static int +spdk_vpp_net_framework_init(void) +{ + int rc; + char *app_name; + + app_name = spdk_sprintf_alloc("SPDK_%d", getpid()); + if (app_name == NULL) { + SPDK_ERRLOG("Cannot alloc memory for SPDK app name\n"); + return -ENOMEM; + } + + rc = vppcom_app_create(app_name); + if (rc == 0) { + g_vpp_initialized = true; + } + + free(app_name); + + return 0; +} + +static void +spdk_vpp_net_framework_fini(void) +{ + if (g_vpp_initialized) { + vppcom_app_destroy(); + } +} + +static struct spdk_net_framework g_vpp_net_framework = { + .name = "vpp", + .init = spdk_vpp_net_framework_init, + .fini = spdk_vpp_net_framework_fini, +}; + +SPDK_NET_FRAMEWORK_REGISTER(vpp, &g_vpp_net_framework); diff --git a/src/spdk/lib/thread/Makefile b/src/spdk/lib/thread/Makefile new file mode 100644 index 00000000..467e32ff --- /dev/null +++ b/src/spdk/lib/thread/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = thread.c +LIBNAME = thread + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/thread/thread.c b/src/spdk/lib/thread/thread.c new file mode 100644 index 00000000..c014f4ed --- /dev/null +++ b/src/spdk/lib/thread/thread.c @@ -0,0 +1,768 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" +#include "spdk/thread.h" + +#include "spdk_internal/log.h" + +#ifdef __linux__ +#include <sys/prctl.h> +#endif + +#ifdef __FreeBSD__ +#include <pthread_np.h> +#endif + +static pthread_mutex_t g_devlist_mutex = PTHREAD_MUTEX_INITIALIZER; + +struct io_device { + void *io_device; + char *name; + spdk_io_channel_create_cb create_cb; + spdk_io_channel_destroy_cb destroy_cb; + spdk_io_device_unregister_cb unregister_cb; + struct spdk_thread *unregister_thread; + uint32_t ctx_size; + uint32_t for_each_count; + TAILQ_ENTRY(io_device) tailq; + + uint32_t refcnt; + + bool unregistered; +}; + +static TAILQ_HEAD(, io_device) g_io_devices = TAILQ_HEAD_INITIALIZER(g_io_devices); + +struct spdk_thread { + pthread_t thread_id; + spdk_thread_pass_msg msg_fn; + spdk_start_poller start_poller_fn; + spdk_stop_poller stop_poller_fn; + void *thread_ctx; + TAILQ_HEAD(, spdk_io_channel) io_channels; + TAILQ_ENTRY(spdk_thread) tailq; + char *name; +}; + +static TAILQ_HEAD(, spdk_thread) g_threads = TAILQ_HEAD_INITIALIZER(g_threads); +static uint32_t g_thread_count = 0; + +static struct spdk_thread * +_get_thread(void) +{ + pthread_t thread_id; + struct spdk_thread *thread; + + thread_id = pthread_self(); + + thread = NULL; + TAILQ_FOREACH(thread, &g_threads, tailq) { + if (thread->thread_id == thread_id) { + return thread; + } + } + + return NULL; +} + +static void +_set_thread_name(const char *thread_name) +{ +#if defined(__linux__) + prctl(PR_SET_NAME, thread_name, 0, 0, 0); +#elif defined(__FreeBSD__) + pthread_set_name_np(pthread_self(), thread_name); +#else +#error missing platform support for thread name +#endif +} + +int +spdk_thread_lib_init(void) +{ + return 0; +} + +void +spdk_thread_lib_fini(void) +{ +} + +struct spdk_thread * +spdk_allocate_thread(spdk_thread_pass_msg msg_fn, + spdk_start_poller start_poller_fn, + spdk_stop_poller stop_poller_fn, + void *thread_ctx, const char *name) +{ + struct spdk_thread *thread; + + pthread_mutex_lock(&g_devlist_mutex); + + thread = _get_thread(); + if (thread) { + SPDK_ERRLOG("Double allocated SPDK thread\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + thread = calloc(1, sizeof(*thread)); + if (!thread) { + SPDK_ERRLOG("Unable to allocate memory for thread\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + thread->thread_id = pthread_self(); + thread->msg_fn = msg_fn; + thread->start_poller_fn = start_poller_fn; + thread->stop_poller_fn = stop_poller_fn; + thread->thread_ctx = thread_ctx; + TAILQ_INIT(&thread->io_channels); + TAILQ_INSERT_TAIL(&g_threads, thread, tailq); + g_thread_count++; + if (name) { + _set_thread_name(name); + thread->name = strdup(name); + } else { + thread->name = spdk_sprintf_alloc("%p", thread); + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Allocating new thread %s\n", thread->name); + + pthread_mutex_unlock(&g_devlist_mutex); + + return thread; +} + +void +spdk_free_thread(void) +{ + struct spdk_thread *thread; + + pthread_mutex_lock(&g_devlist_mutex); + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Freeing thread %s\n", thread->name); + + assert(g_thread_count > 0); + g_thread_count--; + TAILQ_REMOVE(&g_threads, thread, tailq); + free(thread->name); + free(thread); + + pthread_mutex_unlock(&g_devlist_mutex); +} + +uint32_t +spdk_thread_get_count(void) +{ + /* + * Return cached value of the current thread count. 
We could acquire the + * lock and iterate through the TAILQ of threads to count them, but that + * count could still be invalidated after we release the lock. + */ + return g_thread_count; +} + +struct spdk_thread * +spdk_get_thread(void) +{ + struct spdk_thread *thread; + + pthread_mutex_lock(&g_devlist_mutex); + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + } + + pthread_mutex_unlock(&g_devlist_mutex); + + return thread; +} + +const char * +spdk_thread_get_name(const struct spdk_thread *thread) +{ + return thread->name; +} + +void +spdk_thread_send_msg(const struct spdk_thread *thread, spdk_thread_fn fn, void *ctx) +{ + thread->msg_fn(fn, ctx, thread->thread_ctx); +} + + +struct spdk_poller * +spdk_poller_register(spdk_poller_fn fn, + void *arg, + uint64_t period_microseconds) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + + thread = spdk_get_thread(); + if (!thread) { + assert(false); + return NULL; + } + + if (!thread->start_poller_fn || !thread->stop_poller_fn) { + SPDK_ERRLOG("No related functions to start requested poller\n"); + assert(false); + return NULL; + } + + poller = thread->start_poller_fn(thread->thread_ctx, fn, arg, period_microseconds); + if (!poller) { + SPDK_ERRLOG("Unable to start requested poller\n"); + assert(false); + return NULL; + } + + return poller; +} + +void +spdk_poller_unregister(struct spdk_poller **ppoller) +{ + struct spdk_thread *thread; + struct spdk_poller *poller; + + poller = *ppoller; + if (poller == NULL) { + return; + } + + *ppoller = NULL; + + thread = spdk_get_thread(); + + if (thread) { + thread->stop_poller_fn(poller, thread->thread_ctx); + } +} + +struct call_thread { + struct spdk_thread *cur_thread; + spdk_thread_fn fn; + void *ctx; + + struct spdk_thread *orig_thread; + spdk_thread_fn cpl; +}; + +static void +spdk_on_thread(void *ctx) +{ + struct call_thread *ct = ctx; + + ct->fn(ct->ctx); + + pthread_mutex_lock(&g_devlist_mutex); + ct->cur_thread = TAILQ_NEXT(ct->cur_thread, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + if (!ct->cur_thread) { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Completed thread iteration\n"); + + spdk_thread_send_msg(ct->orig_thread, ct->cpl, ct->ctx); + free(ctx); + } else { + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Continuing thread iteration to %s\n", + ct->cur_thread->name); + + spdk_thread_send_msg(ct->cur_thread, spdk_on_thread, ctx); + } +} + +void +spdk_for_each_thread(spdk_thread_fn fn, void *ctx, spdk_thread_fn cpl) +{ + struct call_thread *ct; + + ct = calloc(1, sizeof(*ct)); + if (!ct) { + SPDK_ERRLOG("Unable to perform thread iteration\n"); + cpl(ctx); + return; + } + + ct->fn = fn; + ct->ctx = ctx; + ct->cpl = cpl; + + pthread_mutex_lock(&g_devlist_mutex); + ct->orig_thread = _get_thread(); + ct->cur_thread = TAILQ_FIRST(&g_threads); + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Starting thread iteration from %s\n", + ct->orig_thread->name); + + spdk_thread_send_msg(ct->cur_thread, spdk_on_thread, ct); +} + +void +spdk_io_device_register(void *io_device, spdk_io_channel_create_cb create_cb, + spdk_io_channel_destroy_cb destroy_cb, uint32_t ctx_size, + const char *name) +{ + struct io_device *dev, *tmp; + + assert(io_device != NULL); + assert(create_cb != NULL); + assert(destroy_cb != NULL); + + dev = calloc(1, sizeof(struct io_device)); + if (dev == NULL) { + SPDK_ERRLOG("could not allocate io_device\n"); + return; + } + + dev->io_device = io_device; + if (name) { + dev->name = strdup(name); + } else { + dev->name = 
spdk_sprintf_alloc("%p", dev); + } + dev->create_cb = create_cb; + dev->destroy_cb = destroy_cb; + dev->unregister_cb = NULL; + dev->ctx_size = ctx_size; + dev->for_each_count = 0; + dev->unregistered = false; + dev->refcnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Registering io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, spdk_get_thread()->name); + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(tmp, &g_io_devices, tailq) { + if (tmp->io_device == io_device) { + SPDK_ERRLOG("io_device %p already registered\n", io_device); + free(dev->name); + free(dev); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + } + TAILQ_INSERT_TAIL(&g_io_devices, dev, tailq); + pthread_mutex_unlock(&g_devlist_mutex); +} + +static void +_finish_unregister(void *arg) +{ + struct io_device *dev = arg; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Finishing unregistration of io_device %s (%p) on thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + + dev->unregister_cb(dev->io_device); + free(dev->name); + free(dev); +} + +static void +_spdk_io_device_free(struct io_device *dev) +{ + if (dev->unregister_cb == NULL) { + free(dev->name); + free(dev); + } else { + assert(dev->unregister_thread != NULL); + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "io_device %s (%p) needs to unregister from thread %s\n", + dev->name, dev->io_device, dev->unregister_thread->name); + spdk_thread_send_msg(dev->unregister_thread, _finish_unregister, dev); + } +} + +void +spdk_io_device_unregister(void *io_device, spdk_io_device_unregister_cb unregister_cb) +{ + struct io_device *dev; + uint32_t refcnt; + struct spdk_thread *thread; + + thread = spdk_get_thread(); + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + + if (!dev) { + SPDK_ERRLOG("io_device %p not found\n", io_device); + assert(false); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + if (dev->for_each_count > 0) { + SPDK_ERRLOG("io_device %p has %u for_each calls outstanding\n", io_device, dev->for_each_count); + pthread_mutex_unlock(&g_devlist_mutex); + return; + } + + dev->unregister_cb = unregister_cb; + dev->unregistered = true; + TAILQ_REMOVE(&g_io_devices, dev, tailq); + refcnt = dev->refcnt; + dev->unregister_thread = thread; + pthread_mutex_unlock(&g_devlist_mutex); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Unregistering io_device %s (%p) from thread %s\n", + dev->name, dev->io_device, thread->name); + + if (refcnt > 0) { + /* defer deletion */ + return; + } + + _spdk_io_device_free(dev); +} + +struct spdk_io_channel * +spdk_get_io_channel(void *io_device) +{ + struct spdk_io_channel *ch; + struct spdk_thread *thread; + struct io_device *dev; + int rc; + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(dev, &g_io_devices, tailq) { + if (dev->io_device == io_device) { + break; + } + } + if (dev == NULL) { + SPDK_ERRLOG("could not find io_device %p\n", io_device); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + thread = _get_thread(); + if (!thread) { + SPDK_ERRLOG("No thread allocated\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev == dev) { + ch->ref++; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + /* + * An I/O channel already exists for this device on this + * thread, so return it. 
+ */ + pthread_mutex_unlock(&g_devlist_mutex); + return ch; + } + } + + ch = calloc(1, sizeof(*ch) + dev->ctx_size); + if (ch == NULL) { + SPDK_ERRLOG("could not calloc spdk_io_channel\n"); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + ch->dev = dev; + ch->destroy_cb = dev->destroy_cb; + ch->thread = thread; + ch->ref = 1; + ch->destroy_ref = 0; + TAILQ_INSERT_TAIL(&thread->io_channels, ch, tailq); + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, "Get io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, dev->name, dev->io_device, thread->name, ch->ref); + + dev->refcnt++; + + pthread_mutex_unlock(&g_devlist_mutex); + + rc = dev->create_cb(io_device, (uint8_t *)ch + sizeof(*ch)); + if (rc == -1) { + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + dev->refcnt--; + free(ch); + pthread_mutex_unlock(&g_devlist_mutex); + return NULL; + } + + return ch; +} + +static void +_spdk_put_io_channel(void *arg) +{ + struct spdk_io_channel *ch = arg; + bool do_remove_dev = true; + + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Releasing io_channel %p for io_device %s (%p). Channel thread %p. Current thread %s\n", + ch, ch->dev->name, ch->dev->io_device, ch->thread, spdk_get_thread()->name); + + assert(ch->thread == spdk_get_thread()); + + ch->destroy_ref--; + + if (ch->ref > 0 || ch->destroy_ref > 0) { + /* + * Another reference to the associated io_device was requested + * after this message was sent but before it had a chance to + * execute. + */ + return; + } + + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_REMOVE(&ch->thread->io_channels, ch, tailq); + pthread_mutex_unlock(&g_devlist_mutex); + + /* Don't hold the devlist mutex while the destroy_cb is called. */ + ch->destroy_cb(ch->dev->io_device, spdk_io_channel_get_ctx(ch)); + + pthread_mutex_lock(&g_devlist_mutex); + ch->dev->refcnt--; + + if (!ch->dev->unregistered) { + do_remove_dev = false; + } + + if (ch->dev->refcnt > 0) { + do_remove_dev = false; + } + + pthread_mutex_unlock(&g_devlist_mutex); + + if (do_remove_dev) { + _spdk_io_device_free(ch->dev); + } + free(ch); +} + +void +spdk_put_io_channel(struct spdk_io_channel *ch) +{ + SPDK_DEBUGLOG(SPDK_LOG_THREAD, + "Putting io_channel %p for io_device %s (%p) on thread %s refcnt %u\n", + ch, ch->dev->name, ch->dev->io_device, ch->thread->name, ch->ref); + + ch->ref--; + + if (ch->ref == 0) { + ch->destroy_ref++; + spdk_thread_send_msg(ch->thread, _spdk_put_io_channel, ch); + } +} + +struct spdk_io_channel * +spdk_io_channel_from_ctx(void *ctx) +{ + return (struct spdk_io_channel *)((uint8_t *)ctx - sizeof(struct spdk_io_channel)); +} + +struct spdk_thread * +spdk_io_channel_get_thread(struct spdk_io_channel *ch) +{ + return ch->thread; +} + +struct spdk_io_channel_iter { + void *io_device; + struct io_device *dev; + spdk_channel_msg fn; + int status; + void *ctx; + struct spdk_io_channel *ch; + + struct spdk_thread *cur_thread; + + struct spdk_thread *orig_thread; + spdk_channel_for_each_cpl cpl; +}; + +void * +spdk_io_channel_iter_get_io_device(struct spdk_io_channel_iter *i) +{ + return i->io_device; +} + +struct spdk_io_channel * +spdk_io_channel_iter_get_channel(struct spdk_io_channel_iter *i) +{ + return i->ch; +} + +void * +spdk_io_channel_iter_get_ctx(struct spdk_io_channel_iter *i) +{ + return i->ctx; +} + +static void +_call_completion(void *ctx) +{ + struct spdk_io_channel_iter *i = ctx; + + if (i->cpl != NULL) { + i->cpl(i, i->status); + } + free(i); +} + +static void +_call_channel(void *ctx) +{ + struct spdk_io_channel_iter *i 
= ctx; + struct spdk_io_channel *ch; + + /* + * It is possible that the channel was deleted before this + * message had a chance to execute. If so, skip calling + * the fn() on this thread. + */ + pthread_mutex_lock(&g_devlist_mutex); + TAILQ_FOREACH(ch, &i->cur_thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + break; + } + } + pthread_mutex_unlock(&g_devlist_mutex); + + if (ch) { + i->fn(i); + } else { + spdk_for_each_channel_continue(i, 0); + } +} + +void +spdk_for_each_channel(void *io_device, spdk_channel_msg fn, void *ctx, + spdk_channel_for_each_cpl cpl) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + struct spdk_io_channel_iter *i; + + i = calloc(1, sizeof(*i)); + if (!i) { + SPDK_ERRLOG("Unable to allocate iterator\n"); + return; + } + + i->io_device = io_device; + i->fn = fn; + i->ctx = ctx; + i->cpl = cpl; + + pthread_mutex_lock(&g_devlist_mutex); + i->orig_thread = _get_thread(); + + TAILQ_FOREACH(thread, &g_threads, tailq) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == io_device) { + ch->dev->for_each_count++; + i->dev = ch->dev; + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + spdk_thread_send_msg(thread, _call_channel, i); + return; + } + } + } + + pthread_mutex_unlock(&g_devlist_mutex); + + spdk_thread_send_msg(i->orig_thread, _call_completion, i); +} + +void +spdk_for_each_channel_continue(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_thread *thread; + struct spdk_io_channel *ch; + + assert(i->cur_thread == spdk_get_thread()); + + i->status = status; + + pthread_mutex_lock(&g_devlist_mutex); + if (status) { + goto end; + } + thread = TAILQ_NEXT(i->cur_thread, tailq); + while (thread) { + TAILQ_FOREACH(ch, &thread->io_channels, tailq) { + if (ch->dev->io_device == i->io_device) { + i->cur_thread = thread; + i->ch = ch; + pthread_mutex_unlock(&g_devlist_mutex); + spdk_thread_send_msg(thread, _call_channel, i); + return; + } + } + thread = TAILQ_NEXT(thread, tailq); + } + +end: + i->dev->for_each_count--; + i->ch = NULL; + pthread_mutex_unlock(&g_devlist_mutex); + + spdk_thread_send_msg(i->orig_thread, _call_completion, i); +} + + +SPDK_LOG_REGISTER_COMPONENT("thread", SPDK_LOG_THREAD) diff --git a/src/spdk/lib/trace/Makefile b/src/spdk/lib/trace/Makefile new file mode 100644 index 00000000..8bd9ec17 --- /dev/null +++ b/src/spdk/lib/trace/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = trace.c trace_flags.c +LIBNAME = trace + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/trace/trace.c b/src/spdk/lib/trace/trace.c new file mode 100644 index 00000000..8981bcbd --- /dev/null +++ b/src/spdk/lib/trace/trace.c @@ -0,0 +1,168 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/string.h" +#include "spdk/trace.h" + +static int g_trace_fd = -1; +static char g_shm_name[64]; + +struct spdk_trace_histories *g_trace_histories; + +void +_spdk_trace_record(uint64_t tsc, uint16_t tpoint_id, uint16_t poller_id, uint32_t size, + uint64_t object_id, uint64_t arg1) +{ + struct spdk_trace_history *lcore_history; + struct spdk_trace_entry *next_entry; + unsigned lcore; + + lcore = spdk_env_get_current_core(); + if (lcore >= SPDK_TRACE_MAX_LCORE) { + return; + } + + lcore_history = &g_trace_histories->per_lcore_history[lcore]; + if (tsc == 0) { + tsc = spdk_get_ticks(); + } + + lcore_history->tpoint_count[tpoint_id]++; + + next_entry = &lcore_history->entries[lcore_history->next_entry]; + next_entry->tsc = tsc; + next_entry->tpoint_id = tpoint_id; + next_entry->poller_id = poller_id; + next_entry->size = size; + next_entry->object_id = object_id; + next_entry->arg1 = arg1; + + lcore_history->next_entry++; + if (lcore_history->next_entry == SPDK_TRACE_SIZE) { + lcore_history->next_entry = 0; + } +} + +int +spdk_trace_init(const char *shm_name) +{ + int i = 0; + + snprintf(g_shm_name, sizeof(g_shm_name), "%s", shm_name); + + g_trace_fd = shm_open(shm_name, O_RDWR | O_CREAT, 0600); + if (g_trace_fd == -1) { + fprintf(stderr, "could not shm_open spdk_trace\n"); + fprintf(stderr, "errno=%d %s\n", errno, spdk_strerror(errno)); + return 1; + } + + if (ftruncate(g_trace_fd, sizeof(*g_trace_histories)) != 0) { + fprintf(stderr, "could not truncate shm\n"); + goto trace_init_err; + } + + g_trace_histories = mmap(NULL, sizeof(*g_trace_histories), PROT_READ | PROT_WRITE, + MAP_SHARED, g_trace_fd, 0); + if (g_trace_histories == MAP_FAILED) { + fprintf(stderr, "could not mmap shm\n"); + goto trace_init_err; + } + + /* TODO: On FreeBSD, mlock on shm_open'd memory doesn't seem to work. Docs say that kern.ipc.shm_use_phys=1 + * should allow it, but forcing that doesn't seem to work either. So for now just skip mlock on FreeBSD + * altogether. + */ +#if defined(__linux__) + if (mlock(g_trace_histories, sizeof(*g_trace_histories)) != 0) { + fprintf(stderr, "Could not mlock shm for tracing - %s.\n", spdk_strerror(errno)); + if (errno == ENOMEM) { + fprintf(stderr, "Check /dev/shm for old tracing files that can be deleted.\n"); + } + goto trace_init_err; + } +#endif + + memset(g_trace_histories, 0, sizeof(*g_trace_histories)); + + g_trace_flags = &g_trace_histories->flags; + + g_trace_flags->tsc_rate = spdk_get_ticks_hz(); + + for (i = 0; i < SPDK_TRACE_MAX_LCORE; i++) { + g_trace_histories->per_lcore_history[i].lcore = i; + } + + spdk_trace_flags_init(); + + return 0; + +trace_init_err: + if (g_trace_histories != MAP_FAILED) { + munmap(g_trace_histories, sizeof(*g_trace_histories)); + } + close(g_trace_fd); + g_trace_fd = -1; + shm_unlink(shm_name); + g_trace_histories = NULL; + + return 1; + +} + +void +spdk_trace_cleanup(void) +{ + bool unlink; + + if (g_trace_histories == NULL) { + return; + } + + /* + * Only unlink the shm if there were no tracepoints enabled. This ensures the file + * can be used after this process exits/crashes for debugging. + * Note that we have to calculate this value before g_trace_histories gets unmapped. 
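+ * (g_trace_flags points into that mapping, so it can no longer be read after munmap().)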
+ */ + unlink = spdk_mem_all_zero(g_trace_flags->tpoint_mask, sizeof(g_trace_flags->tpoint_mask)); + munmap(g_trace_histories, sizeof(struct spdk_trace_histories)); + g_trace_histories = NULL; + close(g_trace_fd); + + if (unlink) { + shm_unlink(g_shm_name); + } +} diff --git a/src/spdk/lib/trace/trace_flags.c b/src/spdk/lib/trace/trace_flags.c new file mode 100644 index 00000000..69ca0bdf --- /dev/null +++ b/src/spdk/lib/trace/trace_flags.c @@ -0,0 +1,179 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/trace.h" +#include "spdk/log.h" + +struct spdk_trace_flags *g_trace_flags = NULL; +static struct spdk_trace_register_fn *g_reg_fn_head = NULL; + +uint64_t +spdk_trace_get_tpoint_mask(uint32_t group_id) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("%s: invalid group ID %d\n", __func__, group_id); + return 0ULL; + } + + return g_trace_flags->tpoint_mask[group_id]; +} + +void +spdk_trace_set_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("%s: invalid group ID %d\n", __func__, group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] |= tpoint_mask; +} + +void +spdk_trace_clear_tpoints(uint32_t group_id, uint64_t tpoint_mask) +{ + if (group_id >= SPDK_TRACE_MAX_GROUP_ID) { + SPDK_ERRLOG("%s: invalid group ID %d\n", __func__, group_id); + return; + } + + g_trace_flags->tpoint_mask[group_id] &= ~tpoint_mask; +} + +uint64_t +spdk_trace_get_tpoint_group_mask(void) +{ + uint64_t mask = 0x0; + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (spdk_trace_get_tpoint_mask(i) != 0) { + mask |= (1ULL << i); + } + } + + return mask; +} + +void +spdk_trace_set_tpoint_group_mask(uint64_t tpoint_group_mask) +{ + int i; + + for (i = 0; i < SPDK_TRACE_MAX_GROUP_ID; i++) { + if (tpoint_group_mask & (1ULL << i)) { + spdk_trace_set_tpoints(i, -1ULL); + } + } +} + +void +spdk_trace_register_owner(uint8_t type, char id_prefix) +{ + struct spdk_trace_owner *owner; + + assert(type != OWNER_NONE); + + /* 'owner' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + owner = &g_trace_flags->owner[type]; + assert(owner->type == 0); + + owner->type = type; + owner->id_prefix = id_prefix; +} + +void +spdk_trace_register_object(uint8_t type, char id_prefix) +{ + struct spdk_trace_object *object; + + assert(type != OBJECT_NONE); + + /* 'object' has 256 entries and since 'type' is a uint8_t, it + * can't overrun the array. + */ + object = &g_trace_flags->object[type]; + assert(object->type == 0); + + object->type = type; + object->id_prefix = id_prefix; +} + +void +spdk_trace_register_description(const char *name, const char *short_name, + uint16_t tpoint_id, uint8_t owner_type, + uint8_t object_type, uint8_t new_object, + uint8_t arg1_is_ptr, const char *arg1_name) +{ + struct spdk_trace_tpoint *tpoint; + + assert(tpoint_id != 0); + assert(tpoint_id < SPDK_TRACE_MAX_TPOINT_ID); + + tpoint = &g_trace_flags->tpoint[tpoint_id]; + assert(tpoint->tpoint_id == 0); + + snprintf(tpoint->name, sizeof(tpoint->name), "%s", name); + snprintf(tpoint->short_name, sizeof(tpoint->short_name), "%s", short_name); + tpoint->tpoint_id = tpoint_id; + tpoint->object_type = object_type; + tpoint->owner_type = owner_type; + tpoint->new_object = new_object; + tpoint->arg1_is_ptr = arg1_is_ptr; + snprintf(tpoint->arg1_name, sizeof(tpoint->arg1_name), "%s", arg1_name); +} + +void +spdk_trace_add_register_fn(struct spdk_trace_register_fn *reg_fn) +{ + reg_fn->next = g_reg_fn_head; + g_reg_fn_head = reg_fn; +} + + +void +spdk_trace_flags_init(void) +{ + struct spdk_trace_register_fn *reg_fn; + + reg_fn = g_reg_fn_head; + while (reg_fn) { + reg_fn->reg_fn(); + reg_fn = reg_fn->next; + } +} diff --git a/src/spdk/lib/ut_mock/Makefile b/src/spdk/lib/ut_mock/Makefile new file mode 100644 index 00000000..99584181 --- /dev/null +++ b/src/spdk/lib/ut_mock/Makefile @@ -0,0 +1,40 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. 
+# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = mock.c +LIBNAME = spdk_mock + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/ut_mock/mock.c b/src/spdk/lib/ut_mock/mock.c new file mode 100644 index 00000000..6d141b40 --- /dev/null +++ b/src/spdk/lib/ut_mock/mock.c @@ -0,0 +1,45 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk_internal/mock.h" + +DEFINE_WRAPPER(calloc, void *, (size_t nmemb, size_t size), (nmemb, size)) + +DEFINE_WRAPPER(pthread_mutex_init, int, + (pthread_mutex_t *mtx, const pthread_mutexattr_t *attr), + (mtx, attr)) + +DEFINE_WRAPPER(pthread_mutexattr_init, int, + (pthread_mutexattr_t *attr), (attr)) + +DEFINE_WRAPPER(pthread_self, pthread_t, (void), ()) diff --git a/src/spdk/lib/util/Makefile b/src/spdk/lib/util/Makefile new file mode 100644 index 00000000..c31a506b --- /dev/null +++ b/src/spdk/lib/util/Makefile @@ -0,0 +1,41 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = base64.c bit_array.c cpuset.c crc16.c crc32.c crc32c.c crc32_ieee.c fd.c strerror_tls.c string.c uuid.c +LIBNAME = util +LOCAL_SYS_LIBS = -luuid + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/util/base64.c b/src/spdk/lib/util/base64.c new file mode 100644 index 00000000..81361263 --- /dev/null +++ b/src/spdk/lib/util/base64.c @@ -0,0 +1,228 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" +#include "spdk/endian.h" +#include "spdk/base64.h" + +#define BASE64_ENC_BITMASK 0x3FUL +#define BASE64_PADDING_CHAR '=' + +static const char base64_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +static const char base64_urfsafe_enc_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789-_"; + +static const uint8_t +base64_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static const uint8_t +base64_urlsafe_dec_table[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 255, 255, 255, + 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 63, + 255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, +}; + +static int +_spdk_base64_encode(char *dst, const char *enc_table, const void *src, size_t src_len) +{ + uint32_t raw_u32; + + if (!dst || !src || src_len <= 0) { + return -EINVAL; + } + + while (src_len >= 4) { + raw_u32 = from_be32(src); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK]; + + src_len -= 3; + src += 3; + } + + if (src_len == 0) { + goto out; + } + + raw_u32 = 0; + memcpy(&raw_u32, src, src_len); + raw_u32 = from_be32(&raw_u32); + + *dst++ = enc_table[(raw_u32 >> 26) & BASE64_ENC_BITMASK]; + *dst++ = enc_table[(raw_u32 >> 20) & BASE64_ENC_BITMASK]; + *dst++ = (src_len >= 2) ? enc_table[(raw_u32 >> 14) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + *dst++ = (src_len == 3) ? enc_table[(raw_u32 >> 8) & BASE64_ENC_BITMASK] : BASE64_PADDING_CHAR; + +out: + *dst = '\0'; + + return 0; +} + +int +spdk_base64_encode(char *dst, const void *src, size_t src_len) +{ + return _spdk_base64_encode(dst, base64_enc_table, src, src_len); +} + +int +spdk_base64_urlsafe_encode(char *dst, const void *src, size_t src_len) +{ + return _spdk_base64_encode(dst, base64_urfsafe_enc_table, src, src_len); +} + +static int +_spdk_base64_decode(void *dst, size_t *_dst_len, const uint8_t *dec_table, const char *src) +{ + size_t src_strlen, dst_len; + size_t tail_len = 0; + const uint8_t *src_in; + uint32_t tmp[4]; + int i; + + if (!dst || !src) { + return -EINVAL; + } + + src_strlen = strlen(src); + + /* strlen of src should be 4n */ + if (src_strlen == 0 || src_strlen % 4 != 0) { + return -EINVAL; + } + + /* Consider Base64 padding, it at most has 2 padding characters. */ + for (i = 0; i < 2; i++) { + if (src[src_strlen - 1] != BASE64_PADDING_CHAR) { + break; + } + src_strlen--; + } + + /* strlen of src without padding shouldn't be 4n+1 */ + if (src_strlen == 0 || src_strlen % 4 == 1) { + return -EINVAL; + } + + dst_len = spdk_base64_get_decoded_len(src_strlen); + src_in = (const uint8_t *) src; + + /* space of dst can be used by to_be32 */ + while (src_strlen > 4) { + tmp[0] = dec_table[*src_in++]; + tmp[1] = dec_table[*src_in++]; + tmp[2] = dec_table[*src_in++]; + tmp[3] = dec_table[*src_in++]; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(dst, tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + + dst += 3; + src_strlen -= 4; + } + + /* space of dst is not enough to be used by to_be32 */ + tmp[0] = dec_table[src_in[0]]; + tmp[1] = dec_table[src_in[1]]; + tmp[2] = (src_strlen >= 3) ? dec_table[src_in[2]] : 0; + tmp[3] = (src_strlen == 4) ? 
dec_table[src_in[3]] : 0; + tail_len = src_strlen - 1; + + if (tmp[0] == 255 || tmp[1] == 255 || tmp[2] == 255 || tmp[3] == 255) { + return -EINVAL; + } + + to_be32(&tmp[3], tmp[3] << 8 | tmp[2] << 14 | tmp[1] << 20 | tmp[0] << 26); + memcpy(dst, (uint8_t *)&tmp[3], tail_len); + + /* Assign pointers */ + if (_dst_len) { + *_dst_len = dst_len; + } + + return 0; +} + +int +spdk_base64_decode(void *dst, size_t *dst_len, const char *src) +{ + return _spdk_base64_decode(dst, dst_len, base64_dec_table, src); +} + +int +spdk_base64_urlsafe_decode(void *dst, size_t *dst_len, const char *src) +{ + return _spdk_base64_decode(dst, dst_len, base64_urlsafe_dec_table, src); +} diff --git a/src/spdk/lib/util/bit_array.c b/src/spdk/lib/util/bit_array.c new file mode 100644 index 00000000..d6c112f7 --- /dev/null +++ b/src/spdk/lib/util/bit_array.c @@ -0,0 +1,313 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/bit_array.h" +#include "spdk/env.h" + +#include "spdk/likely.h" +#include "spdk/util.h" + +typedef uint64_t spdk_bit_array_word; +#define SPDK_BIT_ARRAY_WORD_TZCNT(x) (__builtin_ctzll(x)) +#define SPDK_BIT_ARRAY_WORD_POPCNT(x) (__builtin_popcountll(x)) +#define SPDK_BIT_ARRAY_WORD_C(x) ((spdk_bit_array_word)(x)) +#define SPDK_BIT_ARRAY_WORD_BYTES sizeof(spdk_bit_array_word) +#define SPDK_BIT_ARRAY_WORD_BITS (SPDK_BIT_ARRAY_WORD_BYTES * 8) +#define SPDK_BIT_ARRAY_WORD_INDEX_SHIFT spdk_u32log2(SPDK_BIT_ARRAY_WORD_BITS) +#define SPDK_BIT_ARRAY_WORD_INDEX_MASK ((1u << SPDK_BIT_ARRAY_WORD_INDEX_SHIFT) - 1) + +struct spdk_bit_array { + uint32_t bit_count; + spdk_bit_array_word words[]; +}; + +struct spdk_bit_array * +spdk_bit_array_create(uint32_t num_bits) +{ + struct spdk_bit_array *ba = NULL; + + spdk_bit_array_resize(&ba, num_bits); + + return ba; +} + +void +spdk_bit_array_free(struct spdk_bit_array **bap) +{ + struct spdk_bit_array *ba; + + if (!bap) { + return; + } + + ba = *bap; + *bap = NULL; + spdk_dma_free(ba); +} + +static inline uint32_t +spdk_bit_array_word_count(uint32_t num_bits) +{ + return (num_bits + SPDK_BIT_ARRAY_WORD_BITS - 1) >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; +} + +static inline spdk_bit_array_word +spdk_bit_array_word_mask(uint32_t num_bits) +{ + assert(num_bits < SPDK_BIT_ARRAY_WORD_BITS); + return (SPDK_BIT_ARRAY_WORD_C(1) << num_bits) - 1; +} + +int +spdk_bit_array_resize(struct spdk_bit_array **bap, uint32_t num_bits) +{ + struct spdk_bit_array *new_ba; + uint32_t old_word_count, new_word_count; + size_t new_size; + + /* + * Max number of bits allowed is UINT32_MAX - 1, because we use UINT32_MAX to denote + * when a set or cleared bit cannot be found. + */ + if (!bap || num_bits == UINT32_MAX) { + return -EINVAL; + } + + new_word_count = spdk_bit_array_word_count(num_bits); + new_size = offsetof(struct spdk_bit_array, words) + new_word_count * SPDK_BIT_ARRAY_WORD_BYTES; + + /* + * Always keep one extra word with a 0 and a 1 past the actual required size so that the + * find_first functions can just keep going until they match. + */ + new_size += SPDK_BIT_ARRAY_WORD_BYTES; + + new_ba = (struct spdk_bit_array *)spdk_dma_realloc(*bap, new_size, 64, NULL); + if (!new_ba) { + return -ENOMEM; + } + + /* + * Set up special extra word (see above comment about find_first_clear). + * + * This is set to 0b10 so that find_first_clear will find a 0 at the very first + * bit past the end of the buffer, and find_first_set will find a 1 at the next bit + * past that. + */ + new_ba->words[new_word_count] = 0x2; + + if (*bap == NULL) { + old_word_count = 0; + new_ba->bit_count = 0; + } else { + old_word_count = spdk_bit_array_word_count(new_ba->bit_count); + } + + if (new_word_count > old_word_count) { + /* Zero out new entries */ + memset(&new_ba->words[old_word_count], 0, + (new_word_count - old_word_count) * SPDK_BIT_ARRAY_WORD_BYTES); + } else if (new_word_count == old_word_count && num_bits < new_ba->bit_count) { + /* Make sure any existing partial last word is cleared beyond the new num_bits. 
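spdk_bit_array_count_set() relies on the bits past bit_count staying zero.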
*/ + uint32_t last_word_bits; + spdk_bit_array_word mask; + + last_word_bits = num_bits & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + mask = spdk_bit_array_word_mask(last_word_bits); + new_ba->words[old_word_count - 1] &= mask; + } + + new_ba->bit_count = num_bits; + *bap = new_ba; + return 0; +} + +uint32_t +spdk_bit_array_capacity(const struct spdk_bit_array *ba) +{ + return ba->bit_count; +} + +static inline int +_spdk_bit_array_get_word(const struct spdk_bit_array *ba, uint32_t bit_index, + uint32_t *word_index, uint32_t *word_bit_index) +{ + if (spdk_unlikely(bit_index >= ba->bit_count)) { + return -EINVAL; + } + + *word_index = bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + *word_bit_index = bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + + return 0; +} + +bool +spdk_bit_array_get(const struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (_spdk_bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return false; + } + + return (ba->words[word_index] >> word_bit_index) & 1U; +} + +int +spdk_bit_array_set(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (_spdk_bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + return -EINVAL; + } + + ba->words[word_index] |= (SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); + return 0; +} + +void +spdk_bit_array_clear(struct spdk_bit_array *ba, uint32_t bit_index) +{ + uint32_t word_index, word_bit_index; + + if (_spdk_bit_array_get_word(ba, bit_index, &word_index, &word_bit_index)) { + /* + * Clearing past the end of the bit array is a no-op, since bit past the end + * are implicitly 0. + */ + return; + } + + ba->words[word_index] &= ~(SPDK_BIT_ARRAY_WORD_C(1) << word_bit_index); +} + +static inline uint32_t +_spdk_bit_array_find_first(const struct spdk_bit_array *ba, uint32_t start_bit_index, + spdk_bit_array_word xor_mask) +{ + uint32_t word_index, first_word_bit_index; + spdk_bit_array_word word, first_word_mask; + const spdk_bit_array_word *words, *cur_word; + + if (spdk_unlikely(start_bit_index >= ba->bit_count)) { + return ba->bit_count; + } + + word_index = start_bit_index >> SPDK_BIT_ARRAY_WORD_INDEX_SHIFT; + words = ba->words; + cur_word = &words[word_index]; + + /* + * Special case for first word: skip start_bit_index % SPDK_BIT_ARRAY_WORD_BITS bits + * within the first word. + */ + first_word_bit_index = start_bit_index & SPDK_BIT_ARRAY_WORD_INDEX_MASK; + first_word_mask = spdk_bit_array_word_mask(first_word_bit_index); + + word = (*cur_word ^ xor_mask) & ~first_word_mask; + + /* + * spdk_bit_array_resize() guarantees that an extra word with a 1 and a 0 will always be + * at the end of the words[] array, so just keep going until a word matches. + */ + while (word == 0) { + word = *++cur_word ^ xor_mask; + } + + return ((uintptr_t)cur_word - (uintptr_t)words) * 8 + SPDK_BIT_ARRAY_WORD_TZCNT(word); +} + + +uint32_t +spdk_bit_array_find_first_set(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = _spdk_bit_array_find_first(ba, start_bit_index, 0); + + /* + * If we ran off the end of the array and found the 1 bit in the extra word, + * return UINT32_MAX to indicate no actual 1 bits were found. 
+ */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_find_first_clear(const struct spdk_bit_array *ba, uint32_t start_bit_index) +{ + uint32_t bit_index; + + bit_index = _spdk_bit_array_find_first(ba, start_bit_index, SPDK_BIT_ARRAY_WORD_C(-1)); + + /* + * If we ran off the end of the array and found the 0 bit in the extra word, + * return UINT32_MAX to indicate no actual 0 bits were found. + */ + if (bit_index >= ba->bit_count) { + bit_index = UINT32_MAX; + } + + return bit_index; +} + +uint32_t +spdk_bit_array_count_set(const struct spdk_bit_array *ba) +{ + const spdk_bit_array_word *cur_word = ba->words; + uint32_t word_count = spdk_bit_array_word_count(ba->bit_count); + uint32_t set_count = 0; + + while (word_count--) { + /* + * No special treatment is needed for the last (potentially partial) word, since + * spdk_bit_array_resize() makes sure the bits past bit_count are cleared. + */ + set_count += SPDK_BIT_ARRAY_WORD_POPCNT(*cur_word++); + } + + return set_count; +} + +uint32_t +spdk_bit_array_count_clear(const struct spdk_bit_array *ba) +{ + return ba->bit_count - spdk_bit_array_count_set(ba); +} diff --git a/src/spdk/lib/util/cpuset.c b/src/spdk/lib/util/cpuset.c new file mode 100644 index 00000000..1a02e59f --- /dev/null +++ b/src/spdk/lib/util/cpuset.c @@ -0,0 +1,320 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/cpuset.h" +#include "spdk/log.h" + +struct spdk_cpuset { + char str[SPDK_CPUSET_SIZE / 4]; + uint8_t cpus[SPDK_CPUSET_SIZE / 8]; +}; + +struct spdk_cpuset * +spdk_cpuset_alloc(void) +{ + return (struct spdk_cpuset *)calloc(sizeof(struct spdk_cpuset), 1); +} + +void +spdk_cpuset_free(struct spdk_cpuset *set) +{ + free(set); +} + +bool +spdk_cpuset_equal(const struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + assert(set1 != NULL); + assert(set2 != NULL); + return memcmp(set1->cpus, set2->cpus, sizeof(set2->cpus)) == 0; +} + +void +spdk_cpuset_copy(struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + assert(set1 != NULL); + assert(set2 != NULL); + memcpy(&set1->cpus, &set2->cpus, sizeof(set2->cpus)); +} + +void +spdk_cpuset_and(struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + unsigned int i; + assert(set1 != NULL); + assert(set2 != NULL); + for (i = 0; i < sizeof(set2->cpus); i++) { + set1->cpus[i] &= set2->cpus[i]; + } +} + +void +spdk_cpuset_or(struct spdk_cpuset *set1, const struct spdk_cpuset *set2) +{ + unsigned int i; + assert(set1 != NULL); + assert(set2 != NULL); + for (i = 0; i < sizeof(set2->cpus); i++) { + set1->cpus[i] |= set2->cpus[i]; + } +} + +void +spdk_cpuset_zero(struct spdk_cpuset *set) +{ + assert(set != NULL); + memset(set->cpus, 0, sizeof(set->cpus)); +} + +void +spdk_cpuset_set_cpu(struct spdk_cpuset *set, uint32_t cpu, bool state) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + if (state) { + set->cpus[cpu / 8] |= (1U << (cpu % 8)); + } else { + set->cpus[cpu / 8] &= ~(1U << (cpu % 8)); + } +} + +bool +spdk_cpuset_get_cpu(const struct spdk_cpuset *set, uint32_t cpu) +{ + assert(set != NULL); + assert(cpu < sizeof(set->cpus) * 8); + return (set->cpus[cpu / 8] >> (cpu % 8)) & 1U; +} + +uint32_t +spdk_cpuset_count(const struct spdk_cpuset *set) +{ + uint32_t count = 0; + uint8_t n; + unsigned int i; + for (i = 0; i < sizeof(set->cpus); i++) { + n = set->cpus[i]; + while (n) { + n &= (n - 1); + count++; + } + } + return count; +} + +const char * +spdk_cpuset_fmt(struct spdk_cpuset *set) +{ + uint32_t lcore, lcore_max = 0; + int val, i, n; + char *ptr; + static const char *hex = "0123456789abcdef"; + + assert(set != NULL); + + for (lcore = 0; lcore < sizeof(set->cpus) * 8; lcore++) { + if (spdk_cpuset_get_cpu(set, lcore)) { + lcore_max = lcore; + } + } + + ptr = set->str; + n = lcore_max / 8; + val = set->cpus[n]; + + /* Store first number only if it is not leading zero */ + if ((val & 0xf0) != 0) { + *(ptr++) = hex[(val & 0xf0) >> 4]; + } + *(ptr++) = hex[val & 0x0f]; + + for (i = n - 1; i >= 0; i--) { + val = set->cpus[i]; + *(ptr++) = hex[(val & 0xf0) >> 4]; + *(ptr++) = hex[val & 0x0f]; + } + *ptr = '\0'; + + return set->str; +} + +static int +hex_value(uint8_t c) +{ +#define V(x, y) [x] = y + 1 + static const int8_t val[256] = { + V('0', 0), V('1', 1), V('2', 2), V('3', 3), V('4', 4), + V('5', 5), V('6', 6), V('7', 7), V('8', 8), V('9', 9), + V('A', 0xA), V('B', 0xB), V('C', 0xC), V('D', 0xD), V('E', 0xE), V('F', 0xF), + V('a', 0xA), V('b', 0xB), V('c', 0xC), V('d', 0xD), V('e', 0xE), V('f', 0xF), + }; +#undef V + + return val[c] - 1; +} + +static int +parse_list(const char *mask, struct spdk_cpuset *set) +{ + char *end; + const char *ptr = mask; + uint32_t lcore; + uint32_t lcore_min, lcore_max; + + spdk_cpuset_zero(set); + lcore_min = UINT32_MAX; + + ptr++; + end = (char *)ptr; + do { + while (isblank(*ptr)) { + ptr++; + } + if (*ptr == '\0' || *ptr == ']' || *ptr == '-' || *ptr == ',') { 
+ goto invalid_character; + } + + errno = 0; + lcore = strtoul(ptr, &end, 10); + if (errno) { + SPDK_ERRLOG("Conversion of core mask in '%s' failed\n", mask); + return -1; + } + + if (lcore >= sizeof(set->cpus) * 8) { + SPDK_ERRLOG("Core number %" PRIu32 " is out of range in '%s'\n", lcore, mask); + return -1; + } + + while (isblank(*end)) { + end++; + } + + if (*end == '-') { + lcore_min = lcore; + } else if (*end == ',' || *end == ']') { + lcore_max = lcore; + if (lcore_min == UINT32_MAX) { + lcore_min = lcore; + } + if (lcore_min > lcore_max) { + SPDK_ERRLOG("Invalid range of CPUs (%" PRIu32 " > %" PRIu32 ")\n", + lcore_min, lcore_max); + return -1; + } + for (lcore = lcore_min; lcore <= lcore_max; lcore++) { + spdk_cpuset_set_cpu(set, lcore, true); + } + lcore_min = UINT32_MAX; + } else { + goto invalid_character; + } + + ptr = end + 1; + + } while (*end != ']'); + + return 0; + +invalid_character: + if (*end == '\0') { + SPDK_ERRLOG("Unexpected end of core list '%s'\n", mask); + } else { + SPDK_ERRLOG("Parsing of core list '%s' failed on character '%c'\n", mask, *end); + } + return -1; +} + +static int +parse_mask(const char *mask, struct spdk_cpuset *set, size_t len) +{ + int i, j; + char c; + int val; + uint32_t lcore = 0; + + if (mask[0] == '0' && (mask[1] == 'x' || mask[1] == 'X')) { + mask += 2; + len -= 2; + } + + spdk_cpuset_zero(set); + for (i = len - 1; i >= 0; i--) { + c = mask[i]; + val = hex_value(c); + if (val < 0) { + /* Invalid character */ + SPDK_ERRLOG("Invalid character in core mask '%s' (%c)\n", mask, c); + return -1; + } + for (j = 0; j < 4 && lcore < sizeof(set->cpus); j++, lcore++) { + if ((1 << j) & val) { + spdk_cpuset_set_cpu(set, lcore, true); + } + } + } + + return 0; +} + +int +spdk_cpuset_parse(struct spdk_cpuset *set, const char *mask) +{ + int ret; + size_t len; + + if (mask == NULL || set == NULL) { + return -1; + } + + while (isblank(*mask)) { + mask++; + } + + len = strlen(mask); + while (len > 0 && isblank(mask[len - 1])) { + len--; + } + + if (len == 0) { + return -1; + } + + if (mask[0] == '[') { + ret = parse_list(mask, set); + } else { + ret = parse_mask(mask, set, len); + } + + return ret; +} diff --git a/src/spdk/lib/util/crc16.c b/src/spdk/lib/util/crc16.c new file mode 100644 index 00000000..491c9058 --- /dev/null +++ b/src/spdk/lib/util/crc16.c @@ -0,0 +1,53 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/crc16.h" + +uint16_t +spdk_crc16_t10dif(const void *buf, size_t len) +{ + uint32_t j, rem = 0; + const uint8_t *data = (const uint8_t *)buf; + size_t i; + + uint16_t poly = SPDK_T10DIF_CRC16_POLYNOMIAL; + + for (i = 0; i < len; i++) { + rem = rem ^ (data[i] << 8); + for (j = 0; j < 8; j++) { + rem = rem << 1; + rem = (rem & 0x10000) ? rem ^ poly : rem; + } + } + return (uint16_t)rem; +} diff --git a/src/spdk/lib/util/crc32.c b/src/spdk/lib/util/crc32.c new file mode 100644 index 00000000..dfef9c54 --- /dev/null +++ b/src/spdk/lib/util/crc32.c @@ -0,0 +1,66 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/crc32.h" + +void +spdk_crc32_table_init(struct spdk_crc32_table *table, uint32_t polynomial_reflect) +{ + int i, j; + uint32_t val; + + for (i = 0; i < 256; i++) { + val = i; + for (j = 0; j < 8; j++) { + if (val & 1) { + val = (val >> 1) ^ polynomial_reflect; + } else { + val = (val >> 1); + } + } + table->table[i] = val; + } +} + +uint32_t +spdk_crc32_update(const struct spdk_crc32_table *table, const void *buf, size_t len, uint32_t crc) +{ + const uint8_t *buf_u8 = buf; + size_t i; + + for (i = 0; i < len; i++) { + crc = (crc >> 8) ^ table->table[(crc ^ buf_u8[i]) & 0xff]; + } + + return crc; +} diff --git a/src/spdk/lib/util/crc32_ieee.c b/src/spdk/lib/util/crc32_ieee.c new file mode 100644 index 00000000..2956e3fc --- /dev/null +++ b/src/spdk/lib/util/crc32_ieee.c @@ -0,0 +1,48 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/crc32.h" + +static struct spdk_crc32_table g_crc32_ieee_table; + +__attribute__((constructor)) static void +spdk_crc32_ieee_init(void) +{ + spdk_crc32_table_init(&g_crc32_ieee_table, SPDK_CRC32_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32_ieee_update(const void *buf, size_t len, uint32_t crc) +{ + return spdk_crc32_update(&g_crc32_ieee_table, buf, len, crc); +} diff --git a/src/spdk/lib/util/crc32c.c b/src/spdk/lib/util/crc32c.c new file mode 100644 index 00000000..e95283b3 --- /dev/null +++ b/src/spdk/lib/util/crc32c.c @@ -0,0 +1,89 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/crc32.h" + +#if defined(__x86_64__) && defined(__SSE4_2__) +#include + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + uint64_t crc_tmp64; + size_t count; + + /* _mm_crc32_u64() needs a 64-bit intermediate value */ + crc_tmp64 = crc; + + /* Process as much of the buffer as possible in 64-bit blocks. */ + count = len / 8; + while (count--) { + uint64_t block; + + /* + * Use memcpy() to avoid unaligned loads, which are undefined behavior in C. + * The compiler will optimize out the memcpy() in release builds. + */ + memcpy(&block, buf, sizeof(block)); + crc_tmp64 = _mm_crc32_u64(crc_tmp64, block); + buf += sizeof(block); + } + crc = (uint32_t)crc_tmp64; + + /* Handle any trailing bytes. */ + count = len & 7; + while (count--) { + crc = _mm_crc32_u8(crc, *(const uint8_t *)buf); + buf++; + } + + return crc; +} + +#else /* SSE 4.2 (CRC32 instruction) not available */ + +static struct spdk_crc32_table g_crc32c_table; + +__attribute__((constructor)) static void +spdk_crc32c_init(void) +{ + spdk_crc32_table_init(&g_crc32c_table, SPDK_CRC32C_POLYNOMIAL_REFLECT); +} + +uint32_t +spdk_crc32c_update(const void *buf, size_t len, uint32_t crc) +{ + return spdk_crc32_update(&g_crc32c_table, buf, len, crc); +} + +#endif diff --git a/src/spdk/lib/util/fd.c b/src/spdk/lib/util/fd.c new file mode 100644 index 00000000..6b0d0d55 --- /dev/null +++ b/src/spdk/lib/util/fd.c @@ -0,0 +1,103 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/fd.h" + +#ifdef __linux__ +#include +#endif + +static uint64_t +dev_get_size(int fd) +{ +#if defined(DIOCGMEDIASIZE) /* FreeBSD */ + off_t size; + + if (ioctl(fd, DIOCGMEDIASIZE, &size) == 0) { + return size; + } +#elif defined(__linux__) && defined(BLKGETSIZE64) + uint64_t size; + + if (ioctl(fd, BLKGETSIZE64, &size) == 0) { + return size; + } +#endif + + return 0; +} + +uint32_t +spdk_fd_get_blocklen(int fd) +{ +#if defined(DKIOCGETBLOCKSIZE) /* FreeBSD */ + uint32_t blocklen; + + if (ioctl(fd, DKIOCGETBLOCKSIZE, &blocklen) == 0) { + return blocklen; + } +#elif defined(__linux__) && defined(BLKSSZGET) + uint32_t blocklen; + + if (ioctl(fd, BLKSSZGET, &blocklen) == 0) { + return blocklen; + } +#endif + + return 0; +} + +uint64_t +spdk_fd_get_size(int fd) +{ + struct stat st; + + if (fstat(fd, &st) != 0) { + return 0; + } + + if (S_ISLNK(st.st_mode)) { + return 0; + } + + if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) { + return dev_get_size(fd); + } else if (S_ISREG(st.st_mode)) { + return st.st_size; + } + + /* Not REG, CHR or BLK */ + return 0; +} diff --git a/src/spdk/lib/util/strerror_tls.c b/src/spdk/lib/util/strerror_tls.c new file mode 100644 index 00000000..c9dc8f13 --- /dev/null +++ b/src/spdk/lib/util/strerror_tls.c @@ -0,0 +1,43 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/string.h" + +static __thread char strerror_message[64]; + +const char * +spdk_strerror(int errnum) +{ + spdk_strerror_r(errnum, strerror_message, sizeof(strerror_message)); + return strerror_message; +} diff --git a/src/spdk/lib/util/string.c b/src/spdk/lib/util/string.c new file mode 100644 index 00000000..455aa20f --- /dev/null +++ b/src/spdk/lib/util/string.c @@ -0,0 +1,405 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/string.h" + +char * +spdk_vsprintf_alloc(const char *format, va_list args) +{ + va_list args_copy; + char *buf; + size_t bufsize; + int rc; + + /* Try with a small buffer first. */ + bufsize = 32; + + /* Limit maximum buffer size to something reasonable so we don't loop forever. */ + while (bufsize <= 1024 * 1024) { + buf = malloc(bufsize); + if (buf == NULL) { + return NULL; + } + + va_copy(args_copy, args); + rc = vsnprintf(buf, bufsize, format, args_copy); + va_end(args_copy); + + /* + * If vsnprintf() returned a count within our current buffer size, we are done. + * The count does not include the \0 terminator, so rc == bufsize is not OK. + */ + if (rc >= 0 && (size_t)rc < bufsize) { + return buf; + } + + /* + * vsnprintf() should return the required space, but some libc versions do not + * implement this correctly, so just double the buffer size and try again. 
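+	 * For example, a 50-character result with bufsize == 32 makes the
+	 * check above fail (rc == 50 >= 32), so the loop frees the buffer
+	 * and retries with bufsize == 64, which succeeds.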
+ * + * We don't need the data in buf, so rather than realloc(), use free() and malloc() + * again to avoid a copy. + */ + free(buf); + bufsize *= 2; + } + + return NULL; +} + +char * +spdk_sprintf_alloc(const char *format, ...) +{ + va_list args; + char *ret; + + va_start(args, format); + ret = spdk_vsprintf_alloc(format, args); + va_end(args); + + return ret; +} + +char * +spdk_strlwr(char *s) +{ + char *p; + + if (s == NULL) { + return NULL; + } + + p = s; + while (*p != '\0') { + *p = tolower(*p); + p++; + } + + return s; +} + +char * +spdk_strsepq(char **stringp, const char *delim) +{ + char *p, *q, *r; + int quoted = 0, bslash = 0; + + p = *stringp; + if (p == NULL) { + return NULL; + } + + r = q = p; + while (*q != '\0' && *q != '\n') { + /* eat quoted characters */ + if (bslash) { + bslash = 0; + *r++ = *q++; + continue; + } else if (quoted) { + if (quoted == '"' && *q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == quoted) { + quoted = 0; + q++; + continue; + } + *r++ = *q++; + continue; + } else if (*q == '\\') { + bslash = 1; + q++; + continue; + } else if (*q == '"' || *q == '\'') { + quoted = *q; + q++; + continue; + } + + /* separator? */ + if (strchr(delim, *q) == NULL) { + *r++ = *q++; + continue; + } + + /* new string */ + q++; + break; + } + *r = '\0'; + + /* skip tailer */ + while (*q != '\0' && strchr(delim, *q) != NULL) { + q++; + } + if (*q != '\0') { + *stringp = q; + } else { + *stringp = NULL; + } + + return p; +} + +char * +spdk_str_trim(char *s) +{ + char *p, *q; + + if (s == NULL) { + return NULL; + } + + /* remove header */ + p = s; + while (*p != '\0' && isspace(*p)) { + p++; + } + + /* remove tailer */ + q = p + strlen(p); + while (q - 1 >= p && isspace(*(q - 1))) { + q--; + *q = '\0'; + } + + /* if remove header, move */ + if (p != s) { + q = s; + while (*p != '\0') { + *q++ = *p++; + } + *q = '\0'; + } + + return s; +} + +void +spdk_strcpy_pad(void *dst, const char *src, size_t size, int pad) +{ + size_t len; + + len = strlen(src); + if (len < size) { + memcpy(dst, src, len); + memset((char *)dst + len, pad, size - len); + } else { + memcpy(dst, src, size); + } +} + +size_t +spdk_strlen_pad(const void *str, size_t size, int pad) +{ + const uint8_t *start; + const uint8_t *iter; + uint8_t pad_byte; + + pad_byte = (uint8_t)pad; + start = (const uint8_t *)str; + + if (size == 0) { + return 0; + } + + iter = start + size - 1; + while (1) { + if (*iter != pad_byte) { + return iter - start + 1; + } + + if (iter == start) { + /* Hit the start of the string finding only pad_byte. 
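+			 * For example, an 8-byte field containing only the pad
+			 * byte reports a length of 0 here, while "ab" padded
+			 * with spaces to 8 bytes reports 2.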
*/ + return 0; + } + iter--; + } +} + +int +spdk_parse_ip_addr(char *ip, char **host, char **port) +{ + char *p; + + if (ip == NULL) { + return -EINVAL; + } + + *host = NULL; + *port = NULL; + + if (ip[0] == '[') { + /* IPv6 */ + p = strchr(ip, ']'); + if (p == NULL) { + return -EINVAL; + } + *host = &ip[1]; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } else if (*p != ':') { + return -EINVAL; + } + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } else { + /* IPv4 */ + p = strchr(ip, ':'); + if (p == NULL) { + *host = ip; + return 0; + } + + *host = ip; + *p = '\0'; + + p++; + if (*p == '\0') { + return 0; + } + + *port = p; + } + + return 0; +} + +size_t +spdk_str_chomp(char *s) +{ + size_t len = strlen(s); + size_t removed = 0; + + while (len > 0) { + if (s[len - 1] != '\r' && s[len - 1] != '\n') { + break; + } + + s[len - 1] = '\0'; + len--; + removed++; + } + + return removed; +} + +void +spdk_strerror_r(int errnum, char *buf, size_t buflen) +{ + int rc; + +#if defined(__USE_GNU) + char *new_buffer; + new_buffer = strerror_r(errnum, buf, buflen); + if (new_buffer != NULL) { + snprintf(buf, buflen, "%s", new_buffer); + rc = 0; + } else { + rc = 1; + } +#else + rc = strerror_r(errnum, buf, buflen); +#endif + + if (rc != 0) { + snprintf(buf, buflen, "Unknown error %d", errnum); + } +} + +int +spdk_parse_capacity(const char *cap_str, uint64_t *cap, bool *has_prefix) +{ + int rc; + char bin_prefix; + + rc = sscanf(cap_str, "%"SCNu64"%c", cap, &bin_prefix); + if (rc == 1) { + *has_prefix = false; + return 0; + } else if (rc == 0) { + if (errno == 0) { + /* No scanf matches - the string does not start with a digit */ + return -EINVAL; + } else { + /* Parsing error */ + return -errno; + } + } + + *has_prefix = true; + switch (bin_prefix) { + case 'k': + case 'K': + *cap *= 1024; + break; + case 'm': + case 'M': + *cap *= 1024 * 1024; + break; + case 'g': + case 'G': + *cap *= 1024 * 1024 * 1024; + break; + default: + return -EINVAL; + } + + return 0; +} + +bool +spdk_mem_all_zero(const void *data, size_t size) +{ + const uint8_t *buf = data; + + while (size--) { + if (*buf++ != 0) { + return false; + } + } + + return true; +} diff --git a/src/spdk/lib/util/uuid.c b/src/spdk/lib/util/uuid.c new file mode 100644 index 00000000..1af7368f --- /dev/null +++ b/src/spdk/lib/util/uuid.c @@ -0,0 +1,67 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/uuid.h" + +#include + +SPDK_STATIC_ASSERT(sizeof(struct spdk_uuid) == sizeof(uuid_t), "Size mismatch"); + +int +spdk_uuid_parse(struct spdk_uuid *uuid, const char *uuid_str) +{ + return uuid_parse(uuid_str, (void *)uuid) == 0 ? 0 : -EINVAL; +} + +int +spdk_uuid_fmt_lower(char *uuid_str, size_t uuid_str_size, const struct spdk_uuid *uuid) +{ + if (uuid_str_size < SPDK_UUID_STRING_LEN) { + return -EINVAL; + } + + uuid_unparse_lower((void *)uuid, uuid_str); + return 0; +} + +int +spdk_uuid_compare(const struct spdk_uuid *u1, const struct spdk_uuid *u2) +{ + return uuid_compare((void *)u1, (void *)u2); +} + +void +spdk_uuid_generate(struct spdk_uuid *uuid) +{ + uuid_generate((void *)uuid); +} diff --git a/src/spdk/lib/vhost/Makefile b/src/spdk/lib/vhost/Makefile new file mode 100644 index 00000000..b46978e2 --- /dev/null +++ b/src/spdk/lib/vhost/Makefile @@ -0,0 +1,47 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I. 
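+# The bundled DPDK vhost-user sources live in the rte_vhost subdirectory,
+# so its headers are added to the include path here.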
+CFLAGS += -Irte_vhost +CFLAGS += $(ENV_CFLAGS) + +C_SRCS = vhost.c vhost_rpc.c vhost_scsi.c vhost_blk.c vhost_nvme.c + +LIBNAME = vhost + +DIRS-y += rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost/Makefile b/src/spdk/lib/vhost/rte_vhost/Makefile new file mode 100644 index 00000000..b0ae6335 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/Makefile @@ -0,0 +1,46 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += -I. +CFLAGS += $(ENV_CFLAGS) +CFLAGS += -include rte_config.h + +# These are the DPDK vhost files copied (for now) into SPDK +C_SRCS += fd_man.c socket.c vhost_user.c vhost.c + +LIBNAME = rte_vhost + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.c b/src/spdk/lib/vhost/rte_vhost/fd_man.c new file mode 100644 index 00000000..2ceacc9a --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/fd_man.c @@ -0,0 +1,300 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "fd_man.h" + +#define FDPOLLERR (POLLERR | POLLHUP | POLLNVAL) + +static int +get_last_valid_idx(struct fdset *pfdset, int last_valid_idx) +{ + int i; + + for (i = last_valid_idx; i >= 0 && pfdset->fd[i].fd == -1; i--) + ; + + return i; +} + +static void +fdset_move(struct fdset *pfdset, int dst, int src) +{ + pfdset->fd[dst] = pfdset->fd[src]; + pfdset->rwfds[dst] = pfdset->rwfds[src]; +} + +static void +fdset_shrink_nolock(struct fdset *pfdset) +{ + int i; + int last_valid_idx = get_last_valid_idx(pfdset, pfdset->num - 1); + + for (i = 0; i < last_valid_idx; i++) { + if (pfdset->fd[i].fd != -1) + continue; + + fdset_move(pfdset, i, last_valid_idx); + last_valid_idx = get_last_valid_idx(pfdset, last_valid_idx - 1); + } + pfdset->num = last_valid_idx + 1; +} + +/* + * Find deleted fd entries and remove them + */ +static void +fdset_shrink(struct fdset *pfdset) +{ + pthread_mutex_lock(&pfdset->fd_mutex); + fdset_shrink_nolock(pfdset); + pthread_mutex_unlock(&pfdset->fd_mutex); +} + +/** + * Returns the index in the fdset for a given fd. + * @return + * index for the fd, or -1 if fd isn't in the fdset. + */ +static int +fdset_find_fd(struct fdset *pfdset, int fd) +{ + int i; + + for (i = 0; i < pfdset->num && pfdset->fd[i].fd != fd; i++) + ; + + return i == pfdset->num ? -1 : i; +} + +static void +fdset_add_fd(struct fdset *pfdset, int idx, int fd, + fd_cb rcb, fd_cb wcb, void *dat) +{ + struct fdentry *pfdentry = &pfdset->fd[idx]; + struct pollfd *pfd = &pfdset->rwfds[idx]; + + pfdentry->fd = fd; + pfdentry->rcb = rcb; + pfdentry->wcb = wcb; + pfdentry->dat = dat; + + pfd->fd = fd; + pfd->events = rcb ? POLLIN : 0; + pfd->events |= wcb ? POLLOUT : 0; + pfd->revents = 0; +} + +void +fdset_init(struct fdset *pfdset) +{ + int i; + + if (pfdset == NULL) + return; + + for (i = 0; i < MAX_FDS; i++) { + pfdset->fd[i].fd = -1; + pfdset->fd[i].dat = NULL; + } + pfdset->num = 0; +} + +/** + * Register the fd in the fdset with read/write handler and context. + */ +int +fdset_add(struct fdset *pfdset, int fd, fd_cb rcb, fd_cb wcb, void *dat) +{ + int i; + + if (pfdset == NULL || fd == -1) + return -1; + + pthread_mutex_lock(&pfdset->fd_mutex); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + fdset_shrink_nolock(pfdset); + i = pfdset->num < MAX_FDS ? pfdset->num++ : -1; + if (i == -1) { + pthread_mutex_unlock(&pfdset->fd_mutex); + return -2; + } + } + + fdset_add_fd(pfdset, i, fd, rcb, wcb, dat); + pthread_mutex_unlock(&pfdset->fd_mutex); + + return 0; +} + +/** + * Unregister the fd from the fdset. + * Returns context of a given fd or NULL. 
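+ *
+ * If the fd's callback is currently running on the dispatch thread, this
+ * call keeps retrying until the busy flag drops, so the caller can free
+ * the returned context safely afterwards.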
+ */ +void * +fdset_del(struct fdset *pfdset, int fd) +{ + int i; + void *dat = NULL; + + if (pfdset == NULL || fd == -1) + return NULL; + + do { + pthread_mutex_lock(&pfdset->fd_mutex); + + i = fdset_find_fd(pfdset, fd); + if (i != -1 && pfdset->fd[i].busy == 0) { + /* busy indicates r/wcb is executing! */ + dat = pfdset->fd[i].dat; + pfdset->fd[i].fd = -1; + pfdset->fd[i].rcb = pfdset->fd[i].wcb = NULL; + pfdset->fd[i].dat = NULL; + i = -1; + } + pthread_mutex_unlock(&pfdset->fd_mutex); + } while (i != -1); + + return dat; +} + + +/** + * This functions runs in infinite blocking loop until there is no fd in + * pfdset. It calls corresponding r/w handler if there is event on the fd. + * + * Before the callback is called, we set the flag to busy status; If other + * thread(now rte_vhost_driver_unregister) calls fdset_del concurrently, it + * will wait until the flag is reset to zero(which indicates the callback is + * finished), then it could free the context after fdset_del. + */ +void * +fdset_event_dispatch(void *arg) +{ + int i; + struct pollfd *pfd; + struct fdentry *pfdentry; + fd_cb rcb, wcb; + void *dat; + int fd, numfds; + int remove1, remove2; + int need_shrink; + struct fdset *pfdset = arg; + + if (pfdset == NULL) + return NULL; + + while (1) { + + /* + * When poll is blocked, other threads might unregister + * listenfds from and register new listenfds into fdset. + * When poll returns, the entries for listenfds in the fdset + * might have been updated. It is ok if there is unwanted call + * for new listenfds. + */ + pthread_mutex_lock(&pfdset->fd_mutex); + numfds = pfdset->num; + pthread_mutex_unlock(&pfdset->fd_mutex); + + poll(pfdset->rwfds, numfds, 1000 /* millisecs */); + + need_shrink = 0; + for (i = 0; i < numfds; i++) { + pthread_mutex_lock(&pfdset->fd_mutex); + + pfdentry = &pfdset->fd[i]; + fd = pfdentry->fd; + pfd = &pfdset->rwfds[i]; + + if (fd < 0) { + need_shrink = 1; + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + if (!pfd->revents) { + pthread_mutex_unlock(&pfdset->fd_mutex); + continue; + } + + remove1 = remove2 = 0; + + rcb = pfdentry->rcb; + wcb = pfdentry->wcb; + dat = pfdentry->dat; + pfdentry->busy = 1; + + pthread_mutex_unlock(&pfdset->fd_mutex); + + if (rcb && pfd->revents & (POLLIN | FDPOLLERR)) + rcb(fd, dat, &remove1); + if (wcb && pfd->revents & (POLLOUT | FDPOLLERR)) + wcb(fd, dat, &remove2); + pfdentry->busy = 0; + /* + * fdset_del needs to check busy flag. + * We don't allow fdset_del to be called in callback + * directly. + */ + /* + * When we are to clean up the fd from fdset, + * because the fd is closed in the cb, + * the old fd val could be reused by when creates new + * listen fd in another thread, we couldn't call + * fd_set_del. + */ + if (remove1 || remove2) { + pfdentry->fd = -1; + need_shrink = 1; + } + } + + if (need_shrink) + fdset_shrink(pfdset); + } + + return NULL; +} diff --git a/src/spdk/lib/vhost/rte_vhost/fd_man.h b/src/spdk/lib/vhost/rte_vhost/fd_man.h new file mode 100644 index 00000000..3a9d269b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/fd_man.h @@ -0,0 +1,69 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _FD_MAN_H_ +#define _FD_MAN_H_ +#include +#include +#include + +#define MAX_FDS 1024 + +typedef void (*fd_cb)(int fd, void *dat, int *remove); + +struct fdentry { + int fd; /* -1 indicates this entry is empty */ + fd_cb rcb; /* callback when this fd is readable. */ + fd_cb wcb; /* callback when this fd is writeable. */ + void *dat; /* fd context */ + int busy; /* whether this entry is being used in cb. */ +}; + +struct fdset { + struct pollfd rwfds[MAX_FDS]; + struct fdentry fd[MAX_FDS]; + pthread_mutex_t fd_mutex; + int num; /* current fd number of this fdset */ +}; + + +void fdset_init(struct fdset *pfdset); + +int fdset_add(struct fdset *pfdset, int fd, + fd_cb rcb, fd_cb wcb, void *dat); + +void *fdset_del(struct fdset *pfdset, int fd); + +void *fdset_event_dispatch(void *arg); + +#endif diff --git a/src/spdk/lib/vhost/rte_vhost/rte_vhost.h b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h new file mode 100644 index 00000000..29f5b613 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/rte_vhost.h @@ -0,0 +1,474 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2017 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _RTE_VHOST_H_ +#define _RTE_VHOST_H_ + +/** + * @file + * Interface to vhost-user + */ + +#include +#include +#include +#include + +#include +#include +#include + +#define RTE_VHOST_USER_CLIENT (1ULL << 0) +#define RTE_VHOST_USER_NO_RECONNECT (1ULL << 1) +#define RTE_VHOST_USER_DEQUEUE_ZERO_COPY (1ULL << 2) + +/** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. + */ +struct rte_vhost_mem_region { + uint64_t guest_phys_addr; + uint64_t guest_user_addr; + uint64_t host_user_addr; + uint64_t size; + void *mmap_addr; + uint64_t mmap_size; + int fd; +}; + +/** + * Memory structure includes region and mapping information. + */ +struct rte_vhost_memory { + uint32_t nregions; + struct rte_vhost_mem_region regions[0]; +}; + +struct rte_vhost_vring { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint64_t log_guest_addr; + + int callfd; + int kickfd; + uint16_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +}; + +/** + * Device and vring operations. + */ +struct vhost_device_ops { + int (*new_device)(int vid); /**< Add device. */ + void (*destroy_device)(int vid); /**< Remove device. */ + + int (*vring_state_changed)(int vid, uint16_t queue_id, int enable); /**< triggered when a vring is enabled or disabled */ + + /** + * Features could be changed after the feature negotiation. + * For example, VHOST_F_LOG_ALL will be set/cleared at the + * start/end of live migration, respectively. This callback + * is used to inform the application on such change. 
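+	 *
+	 * A handler will typically just test the bit it cares about, for
+	 * example:
+	 *
+	 *   logging_started = RTE_VHOST_NEED_LOG(features);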
+ */ + int (*features_changed)(int vid, uint64_t features); + int (*vhost_nvme_admin_passthrough)(int vid, void *cmd, void *cqe, void *buf); + int (*vhost_nvme_set_cq_call)(int vid, uint16_t qid, int fd); + int (*vhost_nvme_get_cap)(int vid, uint64_t *cap); + + int (*new_connection)(int vid); + void (*destroy_connection)(int vid); + + int (*get_config)(int vid, uint8_t *config, uint32_t config_len); + int (*set_config)(int vid, uint8_t *config, uint32_t offset, + uint32_t len, uint32_t flags); + + void *reserved[2]; /**< Reserved for future extension */ +}; + +/** + * Convert guest physical address to host virtual address + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @return + * the host virtual address on success, 0 on failure + */ +static inline uint64_t __attribute__((always_inline)) +rte_vhost_gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + reg = &mem->regions[i]; + if (gpa >= reg->guest_phys_addr && + gpa < reg->guest_phys_addr + reg->size) { + return gpa - reg->guest_phys_addr + + reg->host_user_addr; + } + } + + return 0; +} + +/** + * Convert guest physical address to host virtual address safely + * + * This variant of rte_vhost_gpa_to_vva() takes care all the + * requested length is mapped and contiguous in process address + * space. + * + * @param mem + * the guest memory regions + * @param gpa + * the guest physical address for querying + * @param len + * the size of the requested area to map, + * updated with actual size mapped + * @return + * the host virtual address on success, 0 on failure */ +static inline uint64_t +rte_vhost_va_from_guest_pa(struct rte_vhost_memory *mem, + uint64_t gpa, uint64_t *len) +{ + struct rte_vhost_mem_region *r; + uint32_t i; + + for (i = 0; i < mem->nregions; i++) { + r = &mem->regions[i]; + if (gpa >= r->guest_phys_addr && + gpa < r->guest_phys_addr + r->size) { + + if (unlikely(*len > r->guest_phys_addr + r->size - gpa)) + *len = r->guest_phys_addr + r->size - gpa; + + return gpa - r->guest_phys_addr + + r->host_user_addr; + } + } + *len = 0; + + return 0; +} + +#define RTE_VHOST_NEED_LOG(features) ((features) & (1ULL << VHOST_F_LOG_ALL)) + +/** + * Log the memory write start with given address. + * + * This function only need be invoked when the live migration starts. + * Therefore, we won't need call it at all in the most of time. For + * making the performance impact be minimum, it's suggested to do a + * check before calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_write(vid, addr, len); + * + * @param vid + * vhost device ID + * @param addr + * the starting address for write + * @param len + * the length to write + */ +void rte_vhost_log_write(int vid, uint64_t addr, uint64_t len); + +/** + * Log the used ring update start at given offset. + * + * Same as rte_vhost_log_write, it's suggested to do a check before + * calling it: + * + * if (unlikely(RTE_VHOST_NEED_LOG(features))) + * rte_vhost_log_used_vring(vid, vring_idx, offset, len); + * + * @param vid + * vhost device ID + * @param vring_idx + * the vring index + * @param offset + * the offset inside the used ring + * @param len + * the length to write + */ +void rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, uint64_t len); + +int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable); + +/** + * Register vhost driver. 
path could be different for multiple + * instance support. + */ +int rte_vhost_driver_register(const char *path, uint64_t flags); + +/* Unregister vhost driver. This is only meaningful to vhost user. */ +int rte_vhost_driver_unregister(const char *path); + +/** + * Set the feature bits the vhost-user driver supports. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_set_features(const char *path, uint64_t features); + +/** + * Enable vhost-user driver features. + * + * Note that + * - the param @features should be a subset of the feature bits provided + * by rte_vhost_driver_set_features(). + * - it must be invoked before vhost-user negotiation starts. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to enable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_enable_features(const char *path, uint64_t features); + +/** + * Disable vhost-user driver features. + * + * The two notes at rte_vhost_driver_enable_features() also apply here. + * + * @param path + * The vhost-user socket file path + * @param features + * Features to disable + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_disable_features(const char *path, uint64_t features); + +/** + * Get the feature bits before feature negotiation. + * + * @param path + * The vhost-user socket file path + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_get_features(const char *path, uint64_t *features); + +/** + * Get the feature bits after negotiation + * + * @param vid + * Vhost device ID + * @param features + * A pointer to store the queried feature bits + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_negotiated_features(int vid, uint64_t *features); + +/* Register callbacks. */ +int rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops); + +/** + * + * Start the vhost-user driver. + * + * This function triggers the vhost-user negotiation. + * + * @param path + * The vhost-user socket file path + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_driver_start(const char *path); + +/** + * Get the MTU value of the device if set in QEMU. + * + * @param vid + * virtio-net device ID + * @param mtu + * The variable to store the MTU value + * + * @return + * 0: success + * -EAGAIN: device not yet started + * -ENOTSUP: device does not support MTU feature + */ +int rte_vhost_get_mtu(int vid, uint16_t *mtu); + +/** + * Get the numa node from which the virtio net device's memory + * is allocated. + * + * @param vid + * vhost device ID + * + * @return + * The numa node, -1 on failure + */ +int rte_vhost_get_numa_node(int vid); + +/** + * Get the virtio net device's ifname, which is the vhost-user socket + * file path. + * + * @param vid + * vhost device ID + * @param buf + * The buffer to stored the queried ifname + * @param len + * The length of buf + * + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_ifname(int vid, char *buf, size_t len); + +/** + * Get how many avail entries are left in the queue + * + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index + * + * @return + * num of avail entires left + */ +uint16_t rte_vhost_avail_entries(int vid, uint16_t queue_id); + +struct rte_mbuf; +struct rte_mempool; +/** + * This function adds buffers to the virtio devices RX virtqueue. 
Buffers can + * be received from the physical port or from another virtual device. A packet + * count is returned to indicate the number of packets that were succesfully + * added to the RX queue. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param pkts + * array to contain packets to be enqueued + * @param count + * packets num to be enqueued + * @return + * num of packets enqueued + */ +uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id, + struct rte_mbuf **pkts, uint16_t count); + +/** + * This function gets guest buffers from the virtio device TX virtqueue, + * construct host mbufs, copies guest buffer content to host mbufs and + * store them in pkts to be processed. + * @param vid + * vhost device ID + * @param queue_id + * virtio queue index in mq case + * @param mbuf_pool + * mbuf_pool where host mbuf is allocated. + * @param pkts + * array to contain packets to be dequeued + * @param count + * packets num to be dequeued + * @return + * num of packets dequeued + */ +uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id, + struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count); + +/** + * Get guest mem table: a list of memory regions. + * + * An rte_vhost_vhost_memory object will be allocated internaly, to hold the + * guest memory regions. Application should free it at destroy_device() + * callback. + * + * @param vid + * vhost device ID + * @param mem + * To store the returned mem regions + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem); + +/** + * Get guest vring info, including the vring address, vring size, etc. + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param vring + * the structure to hold the requested vring info + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring); + +/** + * Set id of the last descriptors in avail and used guest vrings. + * + * In case user application operates directly on buffers, it should use this + * function on device destruction to retrieve the same values later on in device + * creation via rte_vhost_get_vhost_vring(int, uint16_t, struct rte_vhost_vring *) + * + * @param vid + * vhost device ID + * @param vring_idx + * vring index + * @param last_avail_idx + * id of the last descriptor in avail ring to be set + * @param last_used_idx + * id of the last descriptor in used ring to be set + * @return + * 0 on success, -1 on failure + */ +int rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx); + +#endif /* _RTE_VHOST_H_ */ diff --git a/src/spdk/lib/vhost/rte_vhost/socket.c b/src/spdk/lib/vhost/rte_vhost/socket.c new file mode 100644 index 00000000..1bc1e64b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/socket.c @@ -0,0 +1,819 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "fd_man.h" +#include "vhost.h" +#include "vhost_user.h" + + +TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection); + +/* + * Every time rte_vhost_driver_register() is invoked, an associated + * vhost_user_socket struct will be created. + */ +struct vhost_user_socket { + struct vhost_user_connection_list conn_list; + pthread_mutex_t conn_mutex; + char *path; + int socket_fd; + struct sockaddr_un un; + bool is_server; + bool reconnect; + bool dequeue_zero_copy; + + /* + * The "supported_features" indicates the feature bits the + * vhost driver supports. The "features" indicates the feature + * bits after the rte_vhost_driver_features_disable/enable(). + * It is also the final feature bits used for vhost-user + * features negotiation. + */ + uint64_t supported_features; + uint64_t features; + + struct vhost_device_ops const *notify_ops; +}; + +struct vhost_user_connection { + struct vhost_user_socket *vsocket; + int connfd; + int vid; + + TAILQ_ENTRY(vhost_user_connection) next; +}; + +#define MAX_VHOST_SOCKET 1024 +struct vhost_user { + struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET]; + struct fdset fdset; + int vsocket_cnt; + pthread_mutex_t mutex; +}; + +#define MAX_VIRTIO_BACKLOG 128 + +static void vhost_user_server_new_connection(int fd, void *data, int *remove); +static void vhost_user_read_cb(int fd, void *dat, int *remove); +static int create_unix_socket(struct vhost_user_socket *vsocket); +static int vhost_user_start_client(struct vhost_user_socket *vsocket); + +static struct vhost_user vhost_user = { + .fdset = { + .fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} }, + .fd_mutex = PTHREAD_MUTEX_INITIALIZER, + .num = 0 + }, + .vsocket_cnt = 0, + .mutex = PTHREAD_MUTEX_INITIALIZER, +}; + +/* return bytes# of read on success or negative val on failure. 
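+ * Any file descriptors passed in SCM_RIGHTS ancillary data are copied into
+ * the caller-provided fds array.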
*/ +int +read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { + RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + + if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) { + RTE_LOG(ERR, VHOST_CONFIG, "truncted msg\n"); + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL; + cmsg = CMSG_NXTHDR(&msgh, cmsg)) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + memcpy(fds, CMSG_DATA(cmsg), fdsize); + break; + } + } + + return ret; +} + +int +send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num) +{ + + struct iovec iov; + struct msghdr msgh; + size_t fdsize = fd_num * sizeof(int); + char control[CMSG_SPACE(fdsize)]; + struct cmsghdr *cmsg; + int ret; + + memset(&msgh, 0, sizeof(msgh)); + iov.iov_base = buf; + iov.iov_len = buflen; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + if (cmsg == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, "cmsg == NULL\n"); + errno = EINVAL; + return -1; + } + cmsg->cmsg_len = CMSG_LEN(fdsize); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fdsize); + } else { + msgh.msg_control = NULL; + msgh.msg_controllen = 0; + } + + do { + ret = sendmsg(sockfd, &msgh, 0); + } while (ret < 0 && errno == EINTR); + + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "sendmsg error\n"); + return ret; + } + + return ret; +} + +static void +vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket) +{ + int vid; + size_t size; + struct vhost_user_connection *conn; + int ret; + + conn = malloc(sizeof(*conn)); + if (conn == NULL) { + close(fd); + return; + } + + vid = vhost_new_device(vsocket->features); + if (vid == -1) { + goto err; + } + + size = strnlen(vsocket->path, PATH_MAX); + vhost_set_ifname(vid, vsocket->path, size); + + if (vsocket->dequeue_zero_copy) + vhost_enable_dequeue_zero_copy(vid); + + RTE_LOG(INFO, VHOST_CONFIG, "new device, handle is %d\n", vid); + + if (vsocket->notify_ops->new_connection) { + ret = vsocket->notify_ops->new_connection(vid); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add vhost user connection with fd %d\n", + fd); + goto err; + } + } + + conn->connfd = fd; + conn->vsocket = vsocket; + conn->vid = vid; + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb, + NULL, conn); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add fd %d into vhost server fdset\n", + fd); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + goto err; + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + return; + +err: + free(conn); + close(fd); +} + +/* call back when there is new vhost-user connection from client */ +static void +vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused) +{ + struct vhost_user_socket *vsocket = dat; + + fd = accept(fd, NULL, NULL); + if (fd < 0) + return; + + 
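+	/* The accepted fd is a fresh client connection; hand it over to the
+	 * common connection setup path. */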
RTE_LOG(INFO, VHOST_CONFIG, "new vhost user connection is %d\n", fd); + vhost_user_add_connection(fd, vsocket); +} + +static void +vhost_user_read_cb(int connfd, void *dat, int *remove) +{ + struct vhost_user_connection *conn = dat; + struct vhost_user_socket *vsocket = conn->vsocket; + int ret; + + ret = vhost_user_msg_handler(conn->vid, connfd); + if (ret < 0) { + close(connfd); + *remove = 1; + vhost_destroy_device(conn->vid); + + if (vsocket->notify_ops->destroy_connection) + vsocket->notify_ops->destroy_connection(conn->vid); + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); + + if (vsocket->reconnect) { + create_unix_socket(vsocket); + vhost_user_start_client(vsocket); + } + } +} + +static int +create_unix_socket(struct vhost_user_socket *vsocket) +{ + int fd; + struct sockaddr_un *un = &vsocket->un; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) + return -1; + RTE_LOG(INFO, VHOST_CONFIG, "vhost-user %s: socket created, fd: %d\n", + vsocket->is_server ? "server" : "client", fd); + + if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "vhost-user: can't set nonblocking mode for socket, fd: " + "%d (%s)\n", fd, strerror(errno)); + close(fd); + return -1; + } + + memset(un, 0, sizeof(*un)); + un->sun_family = AF_UNIX; + strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path)); + un->sun_path[sizeof(un->sun_path) - 1] = '\0'; + + vsocket->socket_fd = fd; + return 0; +} + +static int +vhost_user_start_server(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + + ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un)); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to bind to %s: %s; remove it and try again\n", + path, strerror(errno)); + goto err; + } + RTE_LOG(INFO, VHOST_CONFIG, "bind to %s\n", path); + + ret = listen(fd, MAX_VIRTIO_BACKLOG); + if (ret < 0) + goto err; + + ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection, + NULL, vsocket); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to add listen fd %d to vhost server fdset\n", + fd); + goto err; + } + + return 0; + +err: + close(fd); + return -1; +} + +struct vhost_user_reconnect { + struct sockaddr_un un; + int fd; + struct vhost_user_socket *vsocket; + + TAILQ_ENTRY(vhost_user_reconnect) next; +}; + +TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect); +struct vhost_user_reconnect_list { + struct vhost_user_reconnect_tailq_list head; + pthread_mutex_t mutex; +}; + +static struct vhost_user_reconnect_list reconn_list; +static pthread_t reconn_tid; + +static int +vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz) +{ + int ret, flags; + + ret = connect(fd, un, sz); + if (ret < 0 && errno != EISCONN) + return -1; + + flags = fcntl(fd, F_GETFL, 0); + if (flags < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't get flags for connfd %d\n", fd); + return -2; + } + if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) { + RTE_LOG(ERR, VHOST_CONFIG, + "can't disable nonblocking on fd %d\n", fd); + return -2; + } + return 0; +} + +static void * +vhost_user_client_reconnect(void *arg __rte_unused) +{ + int ret; + struct vhost_user_reconnect *reconn, *next; + + while (1) { + pthread_mutex_lock(&reconn_list.mutex); + + /* + * An equal implementation of TAILQ_FOREACH_SAFE, + * which does not exist on all platforms. 
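+ * Each pending socket is retried once per one-second pass: -1 from
+ * vhost_user_connect_nonblock() means the server is not up yet and the
+ * entry is kept, while -2 is fatal and the entry is dropped.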
+ */ + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + ret = vhost_user_connect_nonblock(reconn->fd, + (struct sockaddr *)&reconn->un, + sizeof(reconn->un)); + if (ret == -2) { + close(reconn->fd); + RTE_LOG(ERR, VHOST_CONFIG, + "reconnection for fd %d failed\n", + reconn->fd); + goto remove_fd; + } + if (ret == -1) + continue; + + RTE_LOG(INFO, VHOST_CONFIG, + "%s: connected\n", reconn->vsocket->path); + vhost_user_add_connection(reconn->fd, reconn->vsocket); +remove_fd: + TAILQ_REMOVE(&reconn_list.head, reconn, next); + free(reconn); + } + + pthread_mutex_unlock(&reconn_list.mutex); + sleep(1); + } + + return NULL; +} + +static int +vhost_user_reconnect_init(void) +{ + int ret; + + pthread_mutex_init(&reconn_list.mutex, NULL); + TAILQ_INIT(&reconn_list.head); + + ret = pthread_create(&reconn_tid, NULL, + vhost_user_client_reconnect, NULL); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, "failed to create reconnect thread"); + + return ret; +} + +static int +vhost_user_start_client(struct vhost_user_socket *vsocket) +{ + int ret; + int fd = vsocket->socket_fd; + const char *path = vsocket->path; + struct vhost_user_reconnect *reconn; + + ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un, + sizeof(vsocket->un)); + if (ret == 0) { + vhost_user_add_connection(fd, vsocket); + return 0; + } + + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to connect to %s: %s\n", + path, strerror(errno)); + + if (ret == -2 || !vsocket->reconnect) { + close(fd); + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: reconnecting...\n", path); + reconn = malloc(sizeof(*reconn)); + if (reconn == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for reconnect\n"); + close(fd); + return -1; + } + reconn->un = vsocket->un; + reconn->fd = fd; + reconn->vsocket = vsocket; + pthread_mutex_lock(&reconn_list.mutex); + TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next); + pthread_mutex_unlock(&reconn_list.mutex); + + return 0; +} + +static struct vhost_user_socket * +find_vhost_user_socket(const char *path) +{ + int i; + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) + return vsocket; + } + + return NULL; +} + +int +rte_vhost_driver_disable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->features &= ~features; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_enable_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + if ((vsocket->supported_features & features) != features) { + /* + * trying to enable features the driver doesn't + * support. + */ + pthread_mutex_unlock(&vhost_user.mutex); + return -1; + } + vsocket->features |= features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +int +rte_vhost_driver_set_features(const char *path, uint64_t features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) { + vsocket->supported_features = features; + vsocket->features = features; + } + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
0 : -1; +} + +int +rte_vhost_driver_get_features(const char *path, uint64_t *features) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + *features = vsocket->features; + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) { + RTE_LOG(ERR, VHOST_CONFIG, + "socket file %s is not registered yet.\n", path); + return -1; + } else { + return 0; + } +} + +/* + * Register a new vhost-user socket; here we could act as server + * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag + * is set. + */ +int +rte_vhost_driver_register(const char *path, uint64_t flags) +{ + int ret = -1; + struct vhost_user_socket *vsocket; + + if (!path) + return -1; + + pthread_mutex_lock(&vhost_user.mutex); + + if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) { + RTE_LOG(ERR, VHOST_CONFIG, + "error: the number of vhost sockets reaches maximum\n"); + goto out; + } + + vsocket = malloc(sizeof(struct vhost_user_socket)); + if (!vsocket) + goto out; + memset(vsocket, 0, sizeof(struct vhost_user_socket)); + vsocket->path = strdup(path); + if (!vsocket->path) { + free(vsocket); + goto out; + } + TAILQ_INIT(&vsocket->conn_list); + pthread_mutex_init(&vsocket->conn_mutex, NULL); + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + + /* + * Set the supported features correctly for the builtin vhost-user + * net driver. + * + * Applications know nothing about features the builtin virtio net + * driver (virtio_net.c) supports, thus it's not possible for them + * to invoke rte_vhost_driver_set_features(). To workaround it, here + * we set it unconditionally. If the application want to implement + * another vhost-user driver (say SCSI), it should call the + * rte_vhost_driver_set_features(), which will overwrite following + * two values. 
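+ *
+ * As a rough sketch for such a backend (the path and feature mask below are
+ * illustrative assumptions):
+ *
+ *   rte_vhost_driver_register("/tmp/vhost-scsi.0", 0);
+ *   rte_vhost_driver_set_features("/tmp/vhost-scsi.0", my_scsi_features);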
+ */ + vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES; + vsocket->features = VIRTIO_NET_SUPPORTED_FEATURES; + + if ((flags & RTE_VHOST_USER_CLIENT) != 0) { + vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT); + if (vsocket->reconnect && reconn_tid == 0) { + if (vhost_user_reconnect_init() < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + } + } else { + vsocket->is_server = true; + } + ret = create_unix_socket(vsocket); + if (ret < 0) { + free(vsocket->path); + free(vsocket); + goto out; + } + + vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket; + +out: + pthread_mutex_unlock(&vhost_user.mutex); + + return ret; +} + +static bool +vhost_user_remove_reconnect(struct vhost_user_socket *vsocket) +{ + int found = false; + struct vhost_user_reconnect *reconn, *next; + + pthread_mutex_lock(&reconn_list.mutex); + + for (reconn = TAILQ_FIRST(&reconn_list.head); + reconn != NULL; reconn = next) { + next = TAILQ_NEXT(reconn, next); + + if (reconn->vsocket == vsocket) { + TAILQ_REMOVE(&reconn_list.head, reconn, next); + close(reconn->fd); + free(reconn); + found = true; + break; + } + } + pthread_mutex_unlock(&reconn_list.mutex); + return found; +} + +/** + * Unregister the specified vhost socket + */ +int +rte_vhost_driver_unregister(const char *path) +{ + int i; + int count; + struct vhost_user_connection *conn; + + pthread_mutex_lock(&vhost_user.mutex); + + for (i = 0; i < vhost_user.vsocket_cnt; i++) { + struct vhost_user_socket *vsocket = vhost_user.vsockets[i]; + + if (!strcmp(vsocket->path, path)) { + if (vsocket->is_server) { + fdset_del(&vhost_user.fdset, vsocket->socket_fd); + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { + vhost_user_remove_reconnect(vsocket); + } + + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_FOREACH(conn, &vsocket->conn_list, next) { + close(conn->connfd); + } + pthread_mutex_unlock(&vsocket->conn_mutex); + + do { + pthread_mutex_lock(&vsocket->conn_mutex); + conn = TAILQ_FIRST(&vsocket->conn_list); + pthread_mutex_unlock(&vsocket->conn_mutex); + } while (conn != NULL); + + free(vsocket->path); + free(vsocket); + + count = --vhost_user.vsocket_cnt; + vhost_user.vsockets[i] = vhost_user.vsockets[count]; + vhost_user.vsockets[count] = NULL; + pthread_mutex_unlock(&vhost_user.mutex); + + return 0; + } + } + pthread_mutex_unlock(&vhost_user.mutex); + + return -1; +} + +/* + * Register ops so that we can add/remove device to data core. + */ +int +rte_vhost_driver_callback_register(const char *path, + struct vhost_device_ops const * const ops) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + if (vsocket) + vsocket->notify_ops = ops; + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 0 : -1; +} + +struct vhost_device_ops const * +vhost_driver_callback_get(const char *path) +{ + struct vhost_user_socket *vsocket; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + return vsocket ? 
vsocket->notify_ops : NULL; +} + +int +rte_vhost_driver_start(const char *path) +{ + struct vhost_user_socket *vsocket; + static pthread_t fdset_tid; + + pthread_mutex_lock(&vhost_user.mutex); + vsocket = find_vhost_user_socket(path); + pthread_mutex_unlock(&vhost_user.mutex); + + if (!vsocket) + return -1; + + if (fdset_tid == 0) { + int ret = pthread_create(&fdset_tid, NULL, fdset_event_dispatch, + &vhost_user.fdset); + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "failed to create fdset handling thread"); + } + + if (vsocket->is_server) + return vhost_user_start_server(vsocket); + else + return vhost_user_start_client(vsocket); +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.c b/src/spdk/lib/vhost/rte_vhost/vhost.c new file mode 100644 index 00000000..9d4ae71b --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost.c @@ -0,0 +1,482 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +struct virtio_net * +get_device(int vid) +{ + struct virtio_net *dev = vhost_devices[vid]; + + if (unlikely(!dev)) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) device not found.\n", vid); + } + + return dev; +} + +static void +cleanup_vq(struct vhost_virtqueue *vq, int destroy) +{ + if ((vq->callfd >= 0) && (destroy != 0)) + close(vq->callfd); + if (vq->kickfd >= 0) + close(vq->kickfd); +} + +/* + * Unmap any memory, close any file descriptors and + * free any memory owned by a device. + */ +void +cleanup_device(struct virtio_net *dev, int destroy) +{ + uint32_t i; + + vhost_backend_cleanup(dev); + + for (i = 0; i < dev->nr_vring; i++) + cleanup_vq(dev->virtqueue[i], destroy); +} + +/* + * Release virtqueues and device memory. 
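+ * Called from vhost_destroy_device() after cleanup_device() has closed the
+ * kick/call fds; the per-queue shadow used ring is freed here as well.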
+ */ +static void +free_device(struct virtio_net *dev) +{ + uint32_t i; + struct vhost_virtqueue *vq; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + rte_free(vq->shadow_used_ring); + + rte_free(vq); + } + + rte_free(dev); +} + +static void +init_vring_queue(struct vhost_virtqueue *vq) +{ + memset(vq, 0, sizeof(struct vhost_virtqueue)); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + /* Backends are set to -1 indicating an inactive device. */ + vq->backend = -1; + + /* + * always set the vq to enabled; this is to keep compatibility + * with the old QEMU, whereas there is no SET_VRING_ENABLE message. + */ + vq->enabled = 1; + + TAILQ_INIT(&vq->zmbuf_list); +} + +static void +reset_vring_queue(struct vhost_virtqueue *vq) +{ + int callfd; + + callfd = vq->callfd; + init_vring_queue(vq); + vq->callfd = callfd; +} + +int +alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx) +{ + struct vhost_virtqueue *vq; + + vq = rte_malloc(NULL, sizeof(struct vhost_virtqueue), 0); + if (vq == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for vring:%u.\n", vring_idx); + return -1; + } + + dev->virtqueue[vring_idx] = vq; + init_vring_queue(vq); + + dev->nr_vring += 1; + + return 0; +} + +/* + * Reset some variables in device structure, while keeping few + * others untouched, such as vid, ifname, nr_vring: they + * should be same unless the device is removed. + */ +void +reset_device(struct virtio_net *dev) +{ + uint32_t i; + + dev->negotiated_features = 0; + dev->protocol_features = 0; + dev->flags = 0; + + for (i = 0; i < dev->nr_vring; i++) + reset_vring_queue(dev->virtqueue[i]); +} + +/* + * Invoked when there is a new vhost-user connection established (when + * there is a new virtio device being attached). + */ +int +vhost_new_device(uint64_t features) +{ + struct virtio_net *dev; + int i; + + dev = rte_zmalloc(NULL, sizeof(struct virtio_net), 0); + if (dev == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to allocate memory for new dev.\n"); + return -1; + } + + for (i = 0; i < MAX_VHOST_DEVICE; i++) { + if (vhost_devices[i] == NULL) + break; + } + if (i == MAX_VHOST_DEVICE) { + RTE_LOG(ERR, VHOST_CONFIG, + "Failed to find a free slot for new device.\n"); + rte_free(dev); + return -1; + } + + vhost_devices[i] = dev; + dev->vid = i; + dev->features = features; + + return i; +} + +/* + * Invoked when there is the vhost-user connection is broken (when + * the virtio device is being detached). + */ +void +vhost_destroy_device(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(vid); + } + + cleanup_device(dev, 1); + free_device(dev); + + vhost_devices[vid] = NULL; +} + +void +vhost_set_ifname(int vid, const char *if_name, unsigned int if_len) +{ + struct virtio_net *dev; + unsigned int len; + + dev = get_device(vid); + if (dev == NULL) + return; + + len = if_len > sizeof(dev->ifname) ? 
+ sizeof(dev->ifname) : if_len; + + strncpy(dev->ifname, if_name, len); + dev->ifname[sizeof(dev->ifname) - 1] = '\0'; +} + +void +vhost_enable_dequeue_zero_copy(int vid) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + dev->dequeue_zero_copy = 1; +} + +int +rte_vhost_get_mtu(int vid, uint16_t *mtu) +{ + struct virtio_net *dev = get_device(vid); + + if (!dev) + return -ENODEV; + + if (!(dev->flags & VIRTIO_DEV_READY)) + return -EAGAIN; + + if (!(dev->negotiated_features & VIRTIO_NET_F_MTU)) + return -ENOTSUP; + + *mtu = dev->mtu; + + return 0; +} + +int +rte_vhost_get_numa_node(int vid) +{ +#ifdef RTE_LIBRTE_VHOST_NUMA + struct virtio_net *dev = get_device(vid); + int numa_node; + int ret; + + if (dev == NULL) + return -1; + + ret = get_mempolicy(&numa_node, NULL, 0, dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to query numa node: %d\n", vid, ret); + return -1; + } + + return numa_node; +#else + RTE_SET_USED(vid); + return -1; +#endif +} + +int +rte_vhost_get_ifname(int vid, char *buf, size_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + len = RTE_MIN(len, sizeof(dev->ifname)); + + strncpy(buf, dev->ifname, len); + buf[len - 1] = '\0'; + + return 0; +} + +int +rte_vhost_get_negotiated_features(int vid, uint64_t *features) +{ + struct virtio_net *dev; + + dev = get_device(vid); + if (!dev) + return -1; + + *features = dev->negotiated_features; + return 0; +} + +int +rte_vhost_get_mem_table(int vid, struct rte_vhost_memory **mem) +{ + struct virtio_net *dev; + struct rte_vhost_memory *m; + size_t size; + + dev = get_device(vid); + if (!dev) + return -1; + + size = dev->mem->nregions * sizeof(struct rte_vhost_mem_region); + m = malloc(sizeof(struct rte_vhost_memory) + size); + if (!m) + return -1; + + m->nregions = dev->mem->nregions; + memcpy(m->regions, dev->mem->regions, size); + *mem = m; + + return 0; +} + +int +rte_vhost_get_vhost_vring(int vid, uint16_t vring_idx, + struct rte_vhost_vring *vring) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vring->desc = vq->desc; + vring->avail = vq->avail; + vring->used = vq->used; + vring->log_guest_addr = vq->log_guest_addr; + + vring->callfd = vq->callfd; + vring->kickfd = vq->kickfd; + vring->size = vq->size; + + vring->last_avail_idx = vq->last_avail_idx; + vring->last_used_idx = vq->last_used_idx; + + return 0; +} + +uint16_t +rte_vhost_avail_entries(int vid, uint16_t queue_id) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return 0; + + vq = dev->virtqueue[queue_id]; + if (!vq->enabled) + return 0; + + return *(volatile uint16_t *)&vq->avail->idx - vq->last_used_idx; +} + +int +rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return -1; + + if (enable) { + RTE_LOG(ERR, VHOST_CONFIG, + "guest notification isn't supported.\n"); + return -1; + } + + dev->virtqueue[queue_id]->used->flags = VRING_USED_F_NO_NOTIFY; + return 0; +} + +void +rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) +{ + struct virtio_net *dev = get_device(vid); + + if (dev == NULL) + return; + + vhost_log_write(dev, addr, len); +} + +void +rte_vhost_log_used_vring(int vid, uint16_t vring_idx, + uint64_t offset, 
uint64_t len) +{ + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (dev == NULL) + return; + + if (vring_idx >= VHOST_MAX_VRING) + return; + vq = dev->virtqueue[vring_idx]; + if (!vq) + return; + + vhost_log_used_vring(dev, vq, offset, len); +} + +int +rte_vhost_set_vhost_vring_last_idx(int vid, uint16_t vring_idx, + uint16_t last_avail_idx, uint16_t last_used_idx) { + struct virtio_net *dev; + struct vhost_virtqueue *vq; + + dev = get_device(vid); + if (!dev) + return -1; + + if (vring_idx >= VHOST_MAX_VRING) + return -1; + + vq = dev->virtqueue[vring_idx]; + if (!vq) + return -1; + + vq->last_avail_idx = last_avail_idx; + vq->last_used_idx = last_used_idx; + + return 0; +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost.h b/src/spdk/lib/vhost/rte_vhost/vhost.h new file mode 100644 index 00000000..b0a0201d --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost.h @@ -0,0 +1,321 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_CDEV_H_ +#define _VHOST_NET_CDEV_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "rte_vhost.h" +#include "vhost_user.h" + +/* Used to indicate that the device is running on a data core */ +#define VIRTIO_DEV_RUNNING 1 +/* Used to indicate that the device is ready to operate */ +#define VIRTIO_DEV_READY 2 + +/* Backend value set by guest. */ +#define VIRTIO_DEV_STOPPED -1 + +#define BUF_VECTOR_MAX 256 + +/** + * Structure contains buffer address, length and descriptor index + * from vring to do scatter RX. + */ +struct buf_vector { + uint64_t buf_addr; + uint32_t buf_len; + uint32_t desc_idx; +}; + +/* + * A structure to hold some fields needed in zero copy code path, + * mainly for associating an mbuf with the right desc_idx. 
+ */ +struct zcopy_mbuf { + struct rte_mbuf *mbuf; + uint32_t desc_idx; + uint16_t in_use; + + TAILQ_ENTRY(zcopy_mbuf) next; +}; +TAILQ_HEAD(zcopy_mbuf_list, zcopy_mbuf); + +/** + * Structure contains variables relevant to RX/TX virtqueues. + */ +struct vhost_virtqueue { + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; + uint32_t size; + + uint16_t last_avail_idx; + uint16_t last_used_idx; +#define VIRTIO_INVALID_EVENTFD (-1) +#define VIRTIO_UNINITIALIZED_EVENTFD (-2) + + /* Backend value to determine if device should started/stopped */ + int backend; + /* Used to notify the guest (trigger interrupt) */ + int callfd; + /* Currently unused as polling mode is enabled */ + int kickfd; + int enabled; + + /* Physical address of used ring, for logging */ + uint64_t log_guest_addr; + + uint16_t nr_zmbuf; + uint16_t zmbuf_size; + uint16_t last_zmbuf_idx; + struct zcopy_mbuf *zmbufs; + struct zcopy_mbuf_list zmbuf_list; + + struct vring_used_elem *shadow_used_ring; + uint16_t shadow_used_idx; +} __rte_cache_aligned; + +/* Old kernels have no such macros defined */ +#ifndef VIRTIO_NET_F_GUEST_ANNOUNCE + #define VIRTIO_NET_F_GUEST_ANNOUNCE 21 +#endif + +#ifndef VIRTIO_NET_F_MQ + #define VIRTIO_NET_F_MQ 22 +#endif + +#define VHOST_MAX_VRING 0x100 +#define VHOST_MAX_QUEUE_PAIRS 0x80 + +#ifndef VIRTIO_NET_F_MTU + #define VIRTIO_NET_F_MTU 3 +#endif + +/* + * Define virtio 1.0 for older kernels + */ +#ifndef VIRTIO_F_VERSION_1 + #define VIRTIO_F_VERSION_1 32 +#endif + +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +/* Features supported by this builtin vhost-user net driver. */ +#define VIRTIO_NET_SUPPORTED_FEATURES ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | \ + (1ULL << VIRTIO_NET_F_CTRL_VQ) | \ + (1ULL << VIRTIO_NET_F_CTRL_RX) | \ + (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \ + (1ULL << VIRTIO_NET_F_MQ) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO4) | \ + (1ULL << VIRTIO_NET_F_HOST_TSO6) | \ + (1ULL << VIRTIO_NET_F_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_CSUM) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \ + (1ULL << VIRTIO_NET_F_MTU)) + + +struct guest_page { + uint64_t guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; +}; + +/** + * Device structure contains all configuration information relating + * to the device. + */ +struct virtio_net { + /* Frontend (QEMU) memory and memory region information */ + struct rte_vhost_memory *mem; + uint64_t features; + uint64_t negotiated_features; + uint64_t protocol_features; + int vid; + uint32_t is_nvme; + uint32_t flags; + uint16_t vhost_hlen; + /* to tell if we need broadcast rarp packet */ + rte_atomic16_t broadcast_rarp; + uint32_t nr_vring; + int dequeue_zero_copy; + struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; +#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? 
PATH_MAX : IFNAMSIZ) + char ifname[IF_NAME_SZ]; + uint64_t log_size; + uint64_t log_base; + uint64_t log_addr; + struct ether_addr mac; + uint16_t mtu; + + struct vhost_device_ops const *notify_ops; + + uint32_t nr_guest_pages; + uint32_t max_guest_pages; + struct guest_page *guest_pages; + int has_new_mem_table; + struct VhostUserMemory mem_table; + int mem_table_fds[VHOST_MEMORY_MAX_NREGIONS]; +} __rte_cache_aligned; + + +#define VHOST_LOG_PAGE 4096 + +static inline void __attribute__((always_inline)) +vhost_log_page(uint8_t *log_base, uint64_t page) +{ + log_base[page / 8] |= 1 << (page % 8); +} + +static inline void __attribute__((always_inline)) +vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len) +{ + uint64_t page; + + if (likely(((dev->negotiated_features & (1ULL << VHOST_F_LOG_ALL)) == 0) || + !dev->log_base || !len)) + return; + + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8))) + return; + + /* To make sure guest memory updates are committed before logging */ + rte_smp_wmb(); + + page = addr / VHOST_LOG_PAGE; + while (page * VHOST_LOG_PAGE < addr + len) { + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page); + page += 1; + } +} + +static inline void __attribute__((always_inline)) +vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) +{ + vhost_log_write(dev, vq->log_guest_addr + offset, len); +} + +/* Macros for printing using RTE_LOG */ +#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1 +#define RTE_LOGTYPE_VHOST_DATA RTE_LOGTYPE_USER1 + +#ifdef RTE_LIBRTE_VHOST_DEBUG +#define VHOST_MAX_PRINT_BUFF 6072 +#define VHOST_LOG_LEVEL RTE_LOG_DEBUG +#define VHOST_LOG_DEBUG(log_type, fmt, args...) RTE_LOG(DEBUG, log_type, fmt, ##args) +#define PRINT_PACKET(device, addr, size, header) do { \ + char *pkt_addr = (char *)(addr); \ + unsigned int index; \ + char packet[VHOST_MAX_PRINT_BUFF]; \ + \ + if ((header)) \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Header size %d: ", (device->vid), (size)); \ + else \ + snprintf(packet, VHOST_MAX_PRINT_BUFF, "(%d) Packet size %d: ", (device->vid), (size)); \ + for (index = 0; index < (size); index++) { \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), \ + "%02hhx ", pkt_addr[index]); \ + } \ + snprintf(packet + strnlen(packet, VHOST_MAX_PRINT_BUFF), VHOST_MAX_PRINT_BUFF - strnlen(packet, VHOST_MAX_PRINT_BUFF), "\n"); \ + \ + VHOST_LOG_DEBUG(VHOST_DATA, "%s", packet); \ +} while (0) +#else +#define VHOST_LOG_LEVEL RTE_LOG_INFO +#define VHOST_LOG_DEBUG(log_type, fmt, args...) 
do {} while (0) +#define PRINT_PACKET(device, addr, size, header) do {} while (0) +#endif + +extern uint64_t VHOST_FEATURES; +#define MAX_VHOST_DEVICE 1024 +extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +/* Convert guest physical address to host physical address */ +static inline phys_addr_t __attribute__((always_inline)) +gpa_to_hpa(struct virtio_net *dev, uint64_t gpa, uint64_t size) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + if (gpa >= page->guest_phys_addr && + gpa + size < page->guest_phys_addr + page->size) { + return gpa - page->guest_phys_addr + + page->host_phys_addr; + } + } + + return 0; +} + +struct virtio_net *get_device(int vid); + +int vhost_new_device(uint64_t features); +void cleanup_device(struct virtio_net *dev, int destroy); +void reset_device(struct virtio_net *dev); +void vhost_destroy_device(int); + +int alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx); + +void vhost_set_ifname(int, const char *if_name, unsigned int if_len); +void vhost_enable_dequeue_zero_copy(int vid); + +struct vhost_device_ops const *vhost_driver_callback_get(const char *path); + +/* + * Backend-specific cleanup. + * + * TODO: fix it; we have one backend now + */ +void vhost_backend_cleanup(struct virtio_net *dev); + +#endif /* _VHOST_NET_CDEV_H_ */ diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.c b/src/spdk/lib/vhost/rte_vhost/vhost_user.c new file mode 100644 index 00000000..b708a8a7 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.c @@ -0,0 +1,1360 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef RTE_LIBRTE_VHOST_NUMA +#include +#endif + +#include +#include +#include + +#include "vhost.h" +#include "vhost_user.h" + +#define VIRTIO_MIN_MTU 68 +#define VIRTIO_MAX_MTU 65535 + +static const char *vhost_message_str[VHOST_USER_MAX] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_NET_SET_MTU] = "VHOST_USER_NET_SET_MTU", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", + [VHOST_USER_NVME_ADMIN] = "VHOST_USER_NVME_ADMIN", + [VHOST_USER_NVME_SET_CQ_CALL] = "VHOST_USER_NVME_SET_CQ_CALL", + [VHOST_USER_NVME_GET_CAP] = "VHOST_USER_NVME_GET_CAP", + [VHOST_USER_NVME_START_STOP] = "VHOST_USER_NVME_START_STOP", + [VHOST_USER_NVME_IO_CMD] = "VHOST_USER_NVME_IO_CMD" +}; + +static uint64_t +get_blk_size(int fd) +{ + struct stat stat; + int ret; + + ret = fstat(fd, &stat); + return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize; +} + +static void +free_mem_region(struct virtio_net *dev) +{ + uint32_t i; + struct rte_vhost_mem_region *reg; + + if (!dev || !dev->mem) + return; + + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + if (reg->host_user_addr) { + munmap(reg->mmap_addr, reg->mmap_size); + close(reg->fd); + } + } +} + +void +vhost_backend_cleanup(struct virtio_net *dev) +{ + uint32_t i; + + if (dev->mem) { + if (dev->has_new_mem_table) { + for (i = 0; i < dev->mem->nregions; i++) { + close(dev->mem_table_fds[i]); + } + dev->has_new_mem_table = 0; + } + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + free(dev->guest_pages); + dev->guest_pages = NULL; + + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + dev->log_addr = 0; + } +} + +/* + * This function just returns success at the moment unless + * the device hasn't been initialised. + */ +static int +vhost_user_set_owner(void) +{ + return 0; +} + +static int +vhost_user_reset_owner(struct virtio_net *dev) +{ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + cleanup_device(dev, 0); + reset_device(dev); + return 0; +} + +/* + * The features that we support are requested. 
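+ * These are the per-device feature bits handed to vhost_new_device(),
+ * i.e. the set offered before any negotiation has taken place.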
+ */ +static uint64_t +vhost_user_get_features(struct virtio_net *dev) +{ + return dev->features; +} + +/* + * We receive the negotiated features supported by us and the virtio device. + */ +static int +vhost_user_set_features(struct virtio_net *dev, uint64_t features) +{ + uint64_t vhost_features = 0; + + vhost_features = vhost_user_get_features(dev); + if (features & ~vhost_features) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) received invalid negotiated features.\n", + dev->vid); + return -1; + } + + if ((dev->flags & VIRTIO_DEV_RUNNING) && dev->negotiated_features != features) { + if (dev->notify_ops->features_changed) { + dev->notify_ops->features_changed(dev->vid, features); + } else { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + + dev->negotiated_features = features; + if (dev->negotiated_features & + ((1 << VIRTIO_NET_F_MRG_RXBUF) | (1ULL << VIRTIO_F_VERSION_1))) { + dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else { + dev->vhost_hlen = sizeof(struct virtio_net_hdr); + } + VHOST_LOG_DEBUG(VHOST_CONFIG, + "(%d) mergeable RX buffers %s, virtio 1 %s\n", + dev->vid, + (dev->negotiated_features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off", + (dev->negotiated_features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off"); + + return 0; +} + +/* + * The virtio device sends us the size of the descriptor ring. + */ +static int +vhost_user_set_vring_num(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + vq->size = msg->payload.state.num; + + if (dev->dequeue_zero_copy) { + vq->nr_zmbuf = 0; + vq->last_zmbuf_idx = 0; + vq->zmbuf_size = vq->size; + vq->zmbufs = rte_zmalloc(NULL, vq->zmbuf_size * + sizeof(struct zcopy_mbuf), 0); + if (vq->zmbufs == NULL) { + RTE_LOG(WARNING, VHOST_CONFIG, + "failed to allocate mem for zero copy; " + "zero copy is force disabled\n"); + dev->dequeue_zero_copy = 0; + } + } + + vq->shadow_used_ring = rte_malloc(NULL, + vq->size * sizeof(struct vring_used_elem), + RTE_CACHE_LINE_SIZE); + if (!vq->shadow_used_ring) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to allocate memory for shadow used ring.\n"); + return -1; + } + + return 0; +} + +/* + * Reallocate virtio_dev and vhost_virtqueue data structure to make them on the + * same numa node as the memory of vring descriptor. 
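+ * When RTE_LIBRTE_VHOST_NUMA is not defined this compiles to a no-op that
+ * returns the device unchanged.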
+ */ +#ifdef RTE_LIBRTE_VHOST_NUMA +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index) +{ + int oldnode, newnode; + struct virtio_net *old_dev; + struct vhost_virtqueue *old_vq, *vq; + int ret; + + old_dev = dev; + vq = old_vq = dev->virtqueue[index]; + + ret = get_mempolicy(&newnode, NULL, 0, old_vq->desc, + MPOL_F_NODE | MPOL_F_ADDR); + + /* check if we need to reallocate vq */ + ret |= get_mempolicy(&oldnode, NULL, 0, old_vq, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get vq numa information.\n"); + return dev; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate vq from %d to %d node\n", oldnode, newnode); + vq = rte_malloc_socket(NULL, sizeof(*vq), 0, newnode); + if (!vq) + return dev; + + memcpy(vq, old_vq, sizeof(*vq)); + rte_free(old_vq); + } + + /* check if we need to reallocate dev */ + ret = get_mempolicy(&oldnode, NULL, 0, old_dev, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret) { + RTE_LOG(ERR, VHOST_CONFIG, + "Unable to get dev numa information.\n"); + goto out; + } + if (oldnode != newnode) { + RTE_LOG(INFO, VHOST_CONFIG, + "reallocate dev from %d to %d node\n", + oldnode, newnode); + dev = rte_malloc_socket(NULL, sizeof(*dev), 0, newnode); + if (!dev) { + dev = old_dev; + goto out; + } + + memcpy(dev, old_dev, sizeof(*dev)); + rte_free(old_dev); + } + +out: + dev->virtqueue[index] = vq; + vhost_devices[dev->vid] = dev; + + return dev; +} +#else +static struct virtio_net* +numa_realloc(struct virtio_net *dev, int index __rte_unused) +{ + return dev; +} +#endif + +/* + * Converts QEMU virtual address to Vhost virtual address. This function is + * used to convert the ring addresses to our address space. + */ +static uint64_t +qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len) +{ + struct rte_vhost_mem_region *reg; + uint32_t i; + + /* Find the region where the address lives. */ + for (i = 0; i < dev->mem->nregions; i++) { + reg = &dev->mem->regions[i]; + + if (qva >= reg->guest_user_addr && + qva < reg->guest_user_addr + reg->size) { + + if (unlikely(*len > reg->guest_user_addr + reg->size - qva)) + *len = reg->guest_user_addr + reg->size - qva; + + return qva - reg->guest_user_addr + + reg->host_user_addr; + } + } + + return 0; +} + +static int vhost_setup_mem_table(struct virtio_net *dev); + +/* + * The virtio device sends us the desc, used and avail ring addresses. + * This function then converts these to our address space. + */ +static int +vhost_user_set_vring_addr(struct virtio_net *dev, VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq; + uint64_t len; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + if (dev->has_new_mem_table) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + if (dev->mem == NULL) + return -1; + + /* addr->index refers to the queue index. The txq 1, rxq is 0. */ + vq = dev->virtqueue[msg->payload.addr.index]; + + /* The addresses are converted from QEMU virtual to Vhost virtual. 
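+	 * qva_to_vva() also clamps the mapped length, so a ring that is not
+	 * contiguous in its backing region fails the size checks below.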
*/ + len = sizeof(struct vring_desc) * vq->size; + vq->desc = (struct vring_desc *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.desc_user_addr, &len); + if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to map desc ring.\n", + dev->vid); + return -1; + } + + dev = numa_realloc(dev, msg->payload.addr.index); + vq = dev->virtqueue[msg->payload.addr.index]; + + len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size; + vq->avail = (struct vring_avail *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.avail_user_addr, &len); + if (vq->avail == 0 || + len != sizeof(struct vring_avail) + + sizeof(uint16_t) * vq->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find avail ring address.\n", + dev->vid); + return -1; + } + + len = sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size; + vq->used = (struct vring_used *)(uintptr_t)qva_to_vva(dev, + msg->payload.addr.used_user_addr, &len); + if (vq->used == 0 || len != sizeof(struct vring_used) + + sizeof(struct vring_used_elem) * vq->size) { + + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to find used ring address.\n", + dev->vid); + return -1; + } + + if (vq->last_used_idx != vq->used->idx) { + RTE_LOG(WARNING, VHOST_CONFIG, + "last_used_idx (%u) and vq->used->idx (%u) mismatches; " + "some packets maybe resent for Tx and dropped for Rx\n", + vq->last_used_idx, vq->used->idx); + vq->last_used_idx = vq->used->idx; + vq->last_avail_idx = vq->used->idx; + } + + vq->log_guest_addr = msg->payload.addr.log_guest_addr; + + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address desc: %p\n", + dev->vid, vq->desc); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address avail: %p\n", + dev->vid, vq->avail); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) mapped address used: %p\n", + dev->vid, vq->used); + VHOST_LOG_DEBUG(VHOST_CONFIG, "(%d) log_guest_addr: %" PRIx64 "\n", + dev->vid, vq->log_guest_addr); + + return 0; +} + +/* + * The virtio device sends us the available ring last used index. + */ +static int +vhost_user_set_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + /* Remove from the data plane. 
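+	 * The ring base indexes below must not change under an active device.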
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->virtqueue[msg->payload.state.index]->last_used_idx = msg->payload.state.num; + dev->virtqueue[msg->payload.state.index]->last_avail_idx = msg->payload.state.num; + + return 0; +} + +static void +add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr, + uint64_t host_phys_addr, uint64_t size) +{ + struct guest_page *page, *last_page; + + if (dev->nr_guest_pages == dev->max_guest_pages) { + dev->max_guest_pages = RTE_MAX(8U, dev->max_guest_pages * 2); + dev->guest_pages = realloc(dev->guest_pages, + dev->max_guest_pages * sizeof(*page)); + } + + if (dev->nr_guest_pages > 0) { + last_page = &dev->guest_pages[dev->nr_guest_pages - 1]; + /* merge if the two pages are continuous */ + if (host_phys_addr == last_page->host_phys_addr + + last_page->size) { + last_page->size += size; + return; + } + } + + page = &dev->guest_pages[dev->nr_guest_pages++]; + page->guest_phys_addr = guest_phys_addr; + page->host_phys_addr = host_phys_addr; + page->size = size; +} + +static void +add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg, + uint64_t page_size) +{ + uint64_t reg_size = reg->size; + uint64_t host_user_addr = reg->host_user_addr; + uint64_t guest_phys_addr = reg->guest_phys_addr; + uint64_t host_phys_addr; + uint64_t size; + + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t)host_user_addr); + size = page_size - (guest_phys_addr & (page_size - 1)); + size = RTE_MIN(size, reg_size); + + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + + while (reg_size > 0) { + size = RTE_MIN(reg_size, page_size); + host_phys_addr = rte_mem_virt2phy((void *)(uintptr_t) + host_user_addr); + add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size); + + host_user_addr += size; + guest_phys_addr += size; + reg_size -= size; + } +} + +#ifdef RTE_LIBRTE_VHOST_DEBUG +/* TODO: enable it only in debug mode? */ +static void +dump_guest_pages(struct virtio_net *dev) +{ + uint32_t i; + struct guest_page *page; + + for (i = 0; i < dev->nr_guest_pages; i++) { + page = &dev->guest_pages[i]; + + RTE_LOG(INFO, VHOST_CONFIG, + "guest physical page region %u\n" + "\t guest_phys_addr: %" PRIx64 "\n" + "\t host_phys_addr : %" PRIx64 "\n" + "\t size : %" PRIx64 "\n", + i, + page->guest_phys_addr, + page->host_phys_addr, + page->size); + } +} +#else +#define dump_guest_pages(dev) +#endif + +static int +vhost_user_set_mem_table(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + uint32_t i; + + if (dev->has_new_mem_table) { + /* + * The previous mem table was not consumed, so close the + * file descriptors from that mem table before copying + * the new one. + */ + for (i = 0; i < dev->mem_table.nregions; i++) { + close(dev->mem_table_fds[i]); + } + } + + memcpy(&dev->mem_table, &pmsg->payload.memory, sizeof(dev->mem_table)); + memcpy(dev->mem_table_fds, pmsg->fds, sizeof(dev->mem_table_fds)); + dev->has_new_mem_table = 1; + /* vhost-user-nvme will not send + * set vring addr message, enable + * memory address table now. 
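+	 * For other backends the new table is applied lazily, the next time
+	 * vhost_user_set_vring_addr() runs.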
+ */ + if (dev->has_new_mem_table && dev->is_nvme) { + vhost_setup_mem_table(dev); + dev->has_new_mem_table = 0; + } + + return 0; +} + + static int +vhost_setup_mem_table(struct virtio_net *dev) +{ + struct VhostUserMemory memory = dev->mem_table; + struct rte_vhost_mem_region *reg; + struct vhost_virtqueue *vq; + void *mmap_addr; + uint64_t mmap_size; + uint64_t mmap_offset; + uint64_t alignment; + uint32_t i; + int fd; + + if (dev->mem) { + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + } + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + /* Those addresses won't be valid anymore in host address space + * after setting new mem table. Initiator need to resend these + * addresses. + */ + vq->desc = NULL; + vq->avail = NULL; + vq->used = NULL; + } + + dev->nr_guest_pages = 0; + if (!dev->guest_pages) { + dev->max_guest_pages = 8; + dev->guest_pages = malloc(dev->max_guest_pages * + sizeof(struct guest_page)); + } + + dev->mem = rte_zmalloc("vhost-mem-table", sizeof(struct rte_vhost_memory) + + sizeof(struct rte_vhost_mem_region) * memory.nregions, 0); + if (dev->mem == NULL) { + RTE_LOG(ERR, VHOST_CONFIG, + "(%d) failed to allocate memory for dev->mem\n", + dev->vid); + return -1; + } + dev->mem->nregions = memory.nregions; + + for (i = 0; i < memory.nregions; i++) { + fd = dev->mem_table_fds[i]; + reg = &dev->mem->regions[i]; + + reg->guest_phys_addr = memory.regions[i].guest_phys_addr; + reg->guest_user_addr = memory.regions[i].userspace_addr; + reg->size = memory.regions[i].memory_size; + reg->fd = fd; + + mmap_offset = memory.regions[i].mmap_offset; + mmap_size = reg->size + mmap_offset; + + /* mmap() without flag of MAP_ANONYMOUS, should be called + * with length argument aligned with hugepagesz at older + * longterm version Linux, like 2.6.32 and 3.2.72, or + * mmap() will fail with EINVAL. + * + * to avoid failure, make sure in caller to keep length + * aligned. 
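+ * get_blk_size() below obtains the hugepage size backing the fd via
+ * fstat(), and mmap_size is rounded up to that alignment with
+ * RTE_ALIGN_CEIL() before mapping.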
+ */ + alignment = get_blk_size(fd); + if (alignment == (uint64_t)-1) { + RTE_LOG(ERR, VHOST_CONFIG, + "couldn't get hugepage size through fstat\n"); + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); + + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + + if (mmap_addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, + "mmap region %u failed.\n", i); + goto err_mmap; + } + + if (madvise(mmap_addr, mmap_size, MADV_DONTDUMP) != 0) { + RTE_LOG(INFO, VHOST_CONFIG, + "MADV_DONTDUMP advice setting failed.\n"); + } + + reg->mmap_addr = mmap_addr; + reg->mmap_size = mmap_size; + reg->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + + mmap_offset; + + if (dev->dequeue_zero_copy) + add_guest_pages(dev, reg, alignment); + + RTE_LOG(INFO, VHOST_CONFIG, + "guest memory region %u, size: 0x%" PRIx64 "\n" + "\t guest physical addr: 0x%" PRIx64 "\n" + "\t guest virtual addr: 0x%" PRIx64 "\n" + "\t host virtual addr: 0x%" PRIx64 "\n" + "\t mmap addr : 0x%" PRIx64 "\n" + "\t mmap size : 0x%" PRIx64 "\n" + "\t mmap align: 0x%" PRIx64 "\n" + "\t mmap off : 0x%" PRIx64 "\n", + i, reg->size, + reg->guest_phys_addr, + reg->guest_user_addr, + reg->host_user_addr, + (uint64_t)(uintptr_t)mmap_addr, + mmap_size, + alignment, + mmap_offset); + } + + dump_guest_pages(dev); + + return 0; + +err_mmap: + free_mem_region(dev); + rte_free(dev->mem); + dev->mem = NULL; + return -1; +} + +static int +vq_is_ready(struct vhost_virtqueue *vq) +{ + return vq && vq->desc && + vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD && + vq->kickfd != VIRTIO_INVALID_EVENTFD && + vq->callfd != VIRTIO_INVALID_EVENTFD; +} + +static int +virtio_is_ready(struct virtio_net *dev) +{ + struct vhost_virtqueue *vq; + uint32_t i; + + if (dev->nr_vring == 0) + return 0; + + for (i = 0; i < dev->nr_vring; i++) { + vq = dev->virtqueue[i]; + + if (vq_is_ready(vq)) { + RTE_LOG(INFO, VHOST_CONFIG, + "virtio is now ready for processing.\n"); + return 1; + } + } + + return 0; +} + +static void +vhost_user_set_vring_call(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring call idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = file.fd; +} + +static void +vhost_user_set_vring_kick(struct virtio_net *dev, struct VhostUserMsg *pmsg) +{ + struct vhost_vring_file file; + struct vhost_virtqueue *vq; + + /* Remove from the data plane. 
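+ * Replacing the kick fd while the device is live would race with the
+ * data plane, so the device is stopped here and restarted by the
+ * message handler once the queue becomes ready.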
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + file.index = pmsg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + if (pmsg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) + file.fd = VIRTIO_INVALID_EVENTFD; + else + file.fd = pmsg->fds[0]; + RTE_LOG(INFO, VHOST_CONFIG, + "vring kick idx:%d file:%d\n", file.index, file.fd); + + vq = dev->virtqueue[file.index]; + if (vq->kickfd >= 0) + close(vq->kickfd); + vq->kickfd = file.fd; +} + +static void +free_zmbufs(struct vhost_virtqueue *vq) +{ + struct zcopy_mbuf *zmbuf, *next; + + for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list); + zmbuf != NULL; zmbuf = next) { + next = TAILQ_NEXT(zmbuf, next); + + rte_pktmbuf_free(zmbuf->mbuf); + TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next); + } + + rte_free(vq->zmbufs); +} + +/* + * when virtio is stopped, qemu will send us the GET_VRING_BASE message. + */ +static int +vhost_user_get_vring_base(struct virtio_net *dev, + VhostUserMsg *msg) +{ + struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index]; + + /* We have to stop the queue (virtio) if it is running. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->flags &= ~VIRTIO_DEV_READY; + + /* Here we are safe to get the last used index */ + msg->payload.state.num = vq->last_used_idx; + + RTE_LOG(INFO, VHOST_CONFIG, + "vring base idx:%d file:%d\n", msg->payload.state.index, msg->payload.state.num); + /* + * Based on current qemu vhost-user implementation, this message is + * sent and only sent in vhost_vring_stop. + * TODO: cleanup the vring, it isn't usable since here. + */ + if (vq->kickfd >= 0) + close(vq->kickfd); + + vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (vq->callfd >= 0) + close(vq->callfd); + + vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD; + + if (dev->dequeue_zero_copy) + free_zmbufs(vq); + rte_free(vq->shadow_used_ring); + vq->shadow_used_ring = NULL; + + return 0; +} + +/* + * when virtio queues are ready to work, qemu will send us to + * enable the virtio queue pair. + */ +static int +vhost_user_set_vring_enable(struct virtio_net *dev, + VhostUserMsg *msg) +{ + int enable = (int)msg->payload.state.num; + + RTE_LOG(INFO, VHOST_CONFIG, + "set queue enable: %d to qp idx: %d\n", + enable, msg->payload.state.index); + + if (dev->notify_ops->vring_state_changed) + dev->notify_ops->vring_state_changed(dev->vid, msg->payload.state.index, enable); + + dev->virtqueue[msg->payload.state.index]->enabled = enable; + + return 0; +} + +static void +vhost_user_set_protocol_features(struct virtio_net *dev, + uint64_t protocol_features) +{ + if (protocol_features & ~VHOST_USER_PROTOCOL_FEATURES) + return; + + /* Remove from the data plane. */ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + dev->protocol_features = protocol_features; +} + +static int +vhost_user_set_log_base(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + int fd = msg->fds[0]; + uint64_t size, off; + void *addr; + + if (fd < 0) { + RTE_LOG(ERR, VHOST_CONFIG, "invalid log fd: %d\n", fd); + return -1; + } + + if (msg->size != sizeof(VhostUserLog)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid log base msg size: %"PRId32" != %d\n", + msg->size, (int)sizeof(VhostUserLog)); + return -1; + } + + /* Remove from the data plane. 
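+ * Remapping the dirty log while requests are in flight would race with
+ * the logging helpers, so the device is stopped before the new log
+ * region is mapped.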
*/ + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + RTE_LOG(INFO, VHOST_CONFIG, + "log mmap size: %"PRId64", offset: %"PRId64"\n", + size, off); + + /* + * mmap from 0 to workaround a hugepage mmap bug: mmap will + * fail when offset is not page size aligned. + */ + addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, VHOST_CONFIG, "mmap log base failed!\n"); + return -1; + } + + /* + * Free previously mapped log memory on occasionally + * multiple VHOST_USER_SET_LOG_BASE. + */ + if (dev->log_addr) { + munmap((void *)(uintptr_t)dev->log_addr, dev->log_size); + } + dev->log_addr = (uint64_t)(uintptr_t)addr; + dev->log_base = dev->log_addr + off; + dev->log_size = size; + + return 0; +} + +/* + * An rarp packet is constructed and broadcasted to notify switches about + * the new location of the migrated VM, so that packets from outside will + * not be lost after migration. + * + * However, we don't actually "send" a rarp packet here, instead, we set + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. + */ +static int +vhost_user_send_rarp(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + uint8_t *mac = (uint8_t *)&msg->payload.u64; + + RTE_LOG(DEBUG, VHOST_CONFIG, + ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); + memcpy(dev->mac.addr_bytes, mac, 6); + + /* + * Set the flag to inject a RARP broadcast packet at + * rte_vhost_dequeue_burst(). + * + * rte_smp_wmb() is for making sure the mac is copied + * before the flag is set. + */ + rte_smp_wmb(); + rte_atomic16_set(&dev->broadcast_rarp, 1); + + return 0; +} + +static int +vhost_user_net_set_mtu(struct virtio_net *dev, struct VhostUserMsg *msg) +{ + if (msg->payload.u64 < VIRTIO_MIN_MTU || + msg->payload.u64 > VIRTIO_MAX_MTU) { + RTE_LOG(ERR, VHOST_CONFIG, "Invalid MTU size (%"PRIu64")\n", + msg->payload.u64); + + return -1; + } + + dev->mtu = msg->payload.u64; + + return 0; +} + +/* return bytes# of read on success or negative val on failure. 
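+ * The fixed-size header (plus any ancillary fds) is read first; the
+ * variable-length payload is then read separately and rejected if it
+ * exceeds sizeof(msg->payload).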
*/ +static int +read_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS); + if (ret <= 0) + return ret; + + if (msg && msg->size) { + if (msg->size > sizeof(msg->payload)) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid msg size: %d\n", msg->size); + return -1; + } + ret = read(sockfd, &msg->payload, msg->size); + if (ret <= 0) + return ret; + if (ret != (int)msg->size) { + RTE_LOG(ERR, VHOST_CONFIG, + "read control message failed\n"); + return -1; + } + } + + return ret; +} + +static int +send_vhost_message(int sockfd, struct VhostUserMsg *msg) +{ + int ret; + + if (!msg) + return 0; + + msg->flags &= ~VHOST_USER_VERSION_MASK; + msg->flags &= ~VHOST_USER_NEED_REPLY; + msg->flags |= VHOST_USER_VERSION; + msg->flags |= VHOST_USER_REPLY_MASK; + + ret = send_fd_message(sockfd, (char *)msg, + VHOST_USER_HDR_SIZE + msg->size, NULL, 0); + + return ret; +} + +/* + * Allocate a queue pair if it hasn't been allocated yet + */ +static int +vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg) +{ + uint16_t vring_idx; + + switch (msg->request) { + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK; + break; + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + vring_idx = msg->payload.state.index; + break; + case VHOST_USER_SET_VRING_ADDR: + vring_idx = msg->payload.addr.index; + break; + default: + return 0; + } + + if (vring_idx >= VHOST_MAX_VRING) { + RTE_LOG(ERR, VHOST_CONFIG, + "invalid vring index: %u\n", vring_idx); + return -1; + } + + if (dev->virtqueue[vring_idx]) + return 0; + + return alloc_vring_queue(dev, vring_idx); +} + +static int +vhost_user_nvme_io_request_passthrough(struct virtio_net *dev, + uint16_t qid, uint16_t tail_head, + bool is_submission_queue) +{ + return -1; +} + +static int +vhost_user_nvme_admin_passthrough(struct virtio_net *dev, + void *cmd, void *cqe, void *buf) +{ + if (dev->notify_ops->vhost_nvme_admin_passthrough) { + return dev->notify_ops->vhost_nvme_admin_passthrough(dev->vid, cmd, cqe, buf); + } + + return -1; +} + +static int +vhost_user_nvme_set_cq_call(struct virtio_net *dev, uint16_t qid, int fd) +{ + if (dev->notify_ops->vhost_nvme_set_cq_call) { + return dev->notify_ops->vhost_nvme_set_cq_call(dev->vid, qid, fd); + } + + return -1; +} + +static int +vhost_user_nvme_get_cap(struct virtio_net *dev, uint64_t *cap) +{ + if (dev->notify_ops->vhost_nvme_get_cap) { + return dev->notify_ops->vhost_nvme_get_cap(dev->vid, cap); + } + + return -1; +} + +int +vhost_user_msg_handler(int vid, int fd) +{ + struct virtio_net *dev; + struct VhostUserMsg msg; + struct vhost_vring_file file; + int ret; + uint64_t cap; + uint64_t enable; + uint8_t cqe[16]; + uint8_t cmd[64]; + uint8_t buf[4096]; + uint16_t qid, tail_head; + bool is_submission_queue; + + dev = get_device(vid); + if (dev == NULL) + return -1; + + if (!dev->notify_ops) { + dev->notify_ops = vhost_driver_callback_get(dev->ifname); + if (!dev->notify_ops) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to get callback ops for driver %s\n", + dev->ifname); + return -1; + } + } + + ret = read_vhost_message(fd, &msg); + if (ret <= 0 || msg.request >= VHOST_USER_MAX) { + if (ret < 0) + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read message failed\n"); + else if (ret == 0) + RTE_LOG(INFO, VHOST_CONFIG, + "vhost peer closed\n"); + 
else + RTE_LOG(ERR, VHOST_CONFIG, + "vhost read incorrect message\n"); + + return -1; + } + + RTE_LOG(INFO, VHOST_CONFIG, "%s: read message %s\n", + dev->ifname, vhost_message_str[msg.request]); + + ret = vhost_user_check_and_alloc_queue_pair(dev, &msg); + if (ret < 0) { + RTE_LOG(ERR, VHOST_CONFIG, + "failed to alloc queue\n"); + return -1; + } + + switch (msg.request) { + case VHOST_USER_GET_CONFIG: + if (dev->notify_ops->get_config(dev->vid, + msg.payload.config.region, + msg.payload.config.size) != 0) { + msg.size = sizeof(uint64_t); + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_CONFIG: + if ((dev->notify_ops->set_config(dev->vid, + msg.payload.config.region, + msg.payload.config.offset, + msg.payload.config.size, + msg.payload.config.flags)) != 0) { + ret = 1; + } else { + ret = 0; + } + break; + case VHOST_USER_NVME_ADMIN: + if (!dev->is_nvme) { + dev->is_nvme = 1; + } + memcpy(cmd, msg.payload.nvme.cmd.req, sizeof(cmd)); + ret = vhost_user_nvme_admin_passthrough(dev, cmd, cqe, buf); + memcpy(msg.payload.nvme.cmd.cqe, cqe, sizeof(cqe)); + msg.size = sizeof(cqe); + /* NVMe Identify Command */ + if (cmd[0] == 0x06) { + memcpy(msg.payload.nvme.buf, &buf, 4096); + msg.size += 4096; + } + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_SET_CQ_CALL: + file.index = msg.payload.u64 & VHOST_USER_VRING_IDX_MASK; + file.fd = msg.fds[0]; + ret = vhost_user_nvme_set_cq_call(dev, file.index, file.fd); + break; + case VHOST_USER_NVME_GET_CAP: + ret = vhost_user_nvme_get_cap(dev, &cap); + if (!ret) + msg.payload.u64 = cap; + else + msg.payload.u64 = 0; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_NVME_START_STOP: + enable = msg.payload.u64; + /* device must be started before set cq call */ + if (enable) { + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } else { + if (dev->flags & VIRTIO_DEV_RUNNING) { + dev->flags &= ~VIRTIO_DEV_RUNNING; + dev->notify_ops->destroy_device(dev->vid); + } + } + break; + case VHOST_USER_NVME_IO_CMD: + qid = msg.payload.nvme_io.qid; + tail_head = msg.payload.nvme_io.tail_head; + is_submission_queue = (msg.payload.nvme_io.queue_type == VHOST_USER_NVME_SUBMISSION_QUEUE) ? 
true : false; + vhost_user_nvme_io_request_passthrough(dev, qid, tail_head, is_submission_queue); + break; + case VHOST_USER_GET_FEATURES: + msg.payload.u64 = vhost_user_get_features(dev); + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_FEATURES: + vhost_user_set_features(dev, msg.payload.u64); + break; + + case VHOST_USER_GET_PROTOCOL_FEATURES: + msg.payload.u64 = VHOST_USER_PROTOCOL_FEATURES; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_PROTOCOL_FEATURES: + vhost_user_set_protocol_features(dev, msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + vhost_user_set_owner(); + break; + case VHOST_USER_RESET_OWNER: + vhost_user_reset_owner(dev); + break; + + case VHOST_USER_SET_MEM_TABLE: + ret = vhost_user_set_mem_table(dev, &msg); + break; + + case VHOST_USER_SET_LOG_BASE: + vhost_user_set_log_base(dev, &msg); + + /* it needs a reply */ + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + case VHOST_USER_SET_LOG_FD: + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented.\n"); + break; + + case VHOST_USER_SET_VRING_NUM: + vhost_user_set_vring_num(dev, &msg); + break; + case VHOST_USER_SET_VRING_ADDR: + vhost_user_set_vring_addr(dev, &msg); + break; + case VHOST_USER_SET_VRING_BASE: + vhost_user_set_vring_base(dev, &msg); + break; + + case VHOST_USER_GET_VRING_BASE: + vhost_user_get_vring_base(dev, &msg); + msg.size = sizeof(msg.payload.state); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_KICK: + vhost_user_set_vring_kick(dev, &msg); + break; + case VHOST_USER_SET_VRING_CALL: + vhost_user_set_vring_call(dev, &msg); + break; + + case VHOST_USER_SET_VRING_ERR: + if (!(msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)) + close(msg.fds[0]); + RTE_LOG(INFO, VHOST_CONFIG, "not implemented\n"); + break; + + case VHOST_USER_GET_QUEUE_NUM: + msg.payload.u64 = VHOST_MAX_QUEUE_PAIRS; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + break; + + case VHOST_USER_SET_VRING_ENABLE: + vhost_user_set_vring_enable(dev, &msg); + break; + case VHOST_USER_SEND_RARP: + vhost_user_send_rarp(dev, &msg); + break; + + case VHOST_USER_NET_SET_MTU: + ret = vhost_user_net_set_mtu(dev, &msg); + break; + + default: + ret = -1; + break; + + } + + if (msg.flags & VHOST_USER_NEED_REPLY) { + msg.payload.u64 = !!ret; + msg.size = sizeof(msg.payload.u64); + send_vhost_message(fd, &msg); + } + + if (!(dev->flags & VIRTIO_DEV_RUNNING) && virtio_is_ready(dev)) { + dev->flags |= VIRTIO_DEV_READY; + + if (!(dev->flags & VIRTIO_DEV_RUNNING)) { + if (dev->dequeue_zero_copy) { + RTE_LOG(INFO, VHOST_CONFIG, + "dequeue zero copy is enabled\n"); + } + + if (dev->notify_ops->new_device(dev->vid) == 0) + dev->flags |= VIRTIO_DEV_RUNNING; + } + } + + return 0; +} diff --git a/src/spdk/lib/vhost/rte_vhost/vhost_user.h b/src/spdk/lib/vhost/rte_vhost/vhost_user.h new file mode 100644 index 00000000..cb5ff0a6 --- /dev/null +++ b/src/spdk/lib/vhost/rte_vhost/vhost_user.h @@ -0,0 +1,182 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include +#include + +#include "rte_vhost.h" + +/* refer to hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +/* + * Maximum size of virtio device config space + */ +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 +#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3 +#define VHOST_USER_PROTOCOL_F_NET_MTU 4 +#define VHOST_USER_PROTOCOL_F_CONFIG 9 + +#define VHOST_USER_PROTOCOL_FEATURES ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\ + (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \ + (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK) | \ + (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_NET_SET_MTU = 20, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_NVME_ADMIN = 80, + VHOST_USER_NVME_SET_CQ_CALL = 81, + VHOST_USER_NVME_GET_CAP = 82, + VHOST_USER_NVME_START_STOP = 83, + VHOST_USER_NVME_IO_CMD = 84, + VHOST_USER_MAX +} VhostUserRequest; + +typedef enum VhostUserSlaveRequest { + VHOST_USER_SLAVE_NONE = 0, + VHOST_USER_SLAVE_IOTLB_MSG = 1, + VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2, + VHOST_USER_SLAVE_MAX +} VhostUserSlaveRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} 
VhostUserMemory; + +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + +typedef struct VhostUserConfig { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +} VhostUserConfig; + +enum VhostUserNvmeQueueTypes { + VHOST_USER_NVME_SUBMISSION_QUEUE = 1, + VHOST_USER_NVME_COMPLETION_QUEUE = 2, +}; + +typedef struct VhostUserNvmeIO { + enum VhostUserNvmeQueueTypes queue_type; + uint32_t qid; + uint32_t tail_head; +} VhostUserNvmeIO; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) +#define VHOST_USER_NEED_REPLY (0x1 << 3) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + VhostUserConfig config; + struct nvme { + union { + uint8_t req[64]; + uint8_t cqe[16]; + } cmd; + uint8_t buf[4096]; + } nvme; + struct VhostUserNvmeIO nvme_io; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)) VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + + +/* vhost_user.c */ +int vhost_user_msg_handler(int vid, int fd); + +/* socket.c */ +int read_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); +int send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num); + +#endif diff --git a/src/spdk/lib/vhost/vhost.c b/src/spdk/lib/vhost/vhost.c new file mode 100644 index 00000000..0cacf613 --- /dev/null +++ b/src/spdk/lib/vhost/vhost.c @@ -0,0 +1,1503 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/barrier.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +static uint32_t *g_num_ctrlrs; + +/* Path to folder where character device will be created. Can be set by user. */ +static char dev_dirname[PATH_MAX] = ""; + +struct spdk_vhost_dev_event_ctx { + /** Pointer to the controller obtained before enqueuing the event */ + struct spdk_vhost_dev *vdev; + + /** ID of the vdev to send event to. */ + unsigned vdev_id; + + /** User callback function to be executed on given lcore. */ + spdk_vhost_event_fn cb_fn; + + /** Semaphore used to signal that event is done. */ + sem_t sem; + + /** Response to be written by enqueued event. */ + int response; +}; + +static int new_connection(int vid); +static int start_device(int vid); +static void stop_device(int vid); +static void destroy_connection(int vid); +static int get_config(int vid, uint8_t *config, uint32_t len); +static int set_config(int vid, uint8_t *config, uint32_t offset, + uint32_t size, uint32_t flags); + +const struct vhost_device_ops g_spdk_vhost_ops = { + .new_device = start_device, + .destroy_device = stop_device, + .get_config = get_config, + .set_config = set_config, + .new_connection = new_connection, + .destroy_connection = destroy_connection, + .vhost_nvme_admin_passthrough = spdk_vhost_nvme_admin_passthrough, + .vhost_nvme_set_cq_call = spdk_vhost_nvme_set_cq_call, + .vhost_nvme_get_cap = spdk_vhost_nvme_get_cap, +}; + +static TAILQ_HEAD(, spdk_vhost_dev) g_spdk_vhost_devices = TAILQ_HEAD_INITIALIZER( + g_spdk_vhost_devices); +static pthread_mutex_t g_spdk_vhost_mutex = PTHREAD_MUTEX_INITIALIZER; + +void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len) +{ + void *vva; + uint64_t newlen; + + newlen = len; + vva = (void *)rte_vhost_va_from_guest_pa(vdev->mem, addr, &newlen); + if (newlen != len) { + return NULL; + } + + return vva; + +} + +static void +spdk_vhost_log_req_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_id) +{ + struct vring_desc *desc, *desc_table; + uint32_t desc_table_size; + int rc; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + rc = spdk_vhost_vq_get_desc(vdev, virtqueue, req_id, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("Can't log used ring descriptors!\n"); + return; + } + + do { + if (spdk_vhost_vring_desc_is_wr(desc)) { + /* To be honest, only pages realy touched should be logged, but + * doing so would require tracking those changes in each backed. + * Also backend most likely will touch all/most of those pages so + * for lets assume we touched all pages passed to as writeable buffers. 
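+ * The effect is that every writable descriptor is logged in full by the
+ * rte_vhost_log_write() call below, instead of tracking per-page
+ * dirtiness in each backend.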
*/ + rte_vhost_log_write(vdev->vid, desc->addr, desc->len); + } + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + } while (desc); +} + +static void +spdk_vhost_log_used_vring_elem(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t idx) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, ring[idx]); + len = sizeof(virtqueue->vring.used->ring[idx]); + vq_idx = virtqueue - vdev->virtqueue; + + rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len); +} + +static void +spdk_vhost_log_used_vring_idx(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue) +{ + uint64_t offset, len; + uint16_t vq_idx; + + if (spdk_likely(!spdk_vhost_dev_has_feature(vdev, VHOST_F_LOG_ALL))) { + return; + } + + offset = offsetof(struct vring_used, idx); + len = sizeof(virtqueue->vring.used->idx); + vq_idx = virtqueue - vdev->virtqueue; + + rte_vhost_log_used_vring(vdev->vid, vq_idx, offset, len); +} + +/* + * Get available requests from avail ring. + */ +uint16_t +spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *virtqueue, uint16_t *reqs, + uint16_t reqs_len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_avail *avail = vring->avail; + uint16_t size_mask = vring->size - 1; + uint16_t last_idx = vring->last_avail_idx, avail_idx = avail->idx; + uint16_t count, i; + + count = avail_idx - last_idx; + if (spdk_likely(count == 0)) { + return 0; + } + + if (spdk_unlikely(count > vring->size)) { + /* TODO: the queue is unrecoverably broken and should be marked so. + * For now we will fail silently and report there are no new avail entries. + */ + return 0; + } + + count = spdk_min(count, reqs_len); + vring->last_avail_idx += count; + for (i = 0; i < count; i++) { + reqs[i] = vring->avail->ring[(last_idx + i) & size_mask]; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "AVAIL: last_idx=%"PRIu16" avail_idx=%"PRIu16" count=%"PRIu16"\n", + last_idx, avail_idx, count); + + return count; +} + +static bool +spdk_vhost_vring_desc_is_indirect(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_INDIRECT); +} + +int +spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size) +{ + if (spdk_unlikely(req_idx >= virtqueue->vring.size)) { + return -1; + } + + *desc = &virtqueue->vring.desc[req_idx]; + + if (spdk_vhost_vring_desc_is_indirect(*desc)) { + assert(spdk_vhost_dev_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)); + *desc_table_size = (*desc)->len / sizeof(**desc); + *desc_table = spdk_vhost_gpa_to_vva(vdev, (*desc)->addr, + sizeof(**desc) * *desc_table_size); + *desc = *desc_table; + if (*desc == NULL) { + return -1; + } + + return 0; + } + + *desc_table = virtqueue->vring.desc; + *desc_table_size = virtqueue->vring.size; + + return 0; +} + +int +spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue) +{ + if (virtqueue->used_req_cnt == 0) { + return 0; + } + + virtqueue->req_cnt += virtqueue->used_req_cnt; + virtqueue->used_req_cnt = 0; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: sending IRQ: last used %"PRIu16"\n", + virtqueue - vdev->virtqueue, virtqueue->vring.last_used_idx); + + eventfd_write(virtqueue->vring.callfd, (eventfd_t)1); + return 1; +} + + +static void +check_dev_io_stats(struct 
spdk_vhost_dev *vdev, uint64_t now) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint32_t irq_delay_base = vdev->coalescing_delay_time_base; + uint32_t io_threshold = vdev->coalescing_io_rate_threshold; + int32_t irq_delay; + uint32_t req_cnt; + uint16_t q_idx; + + if (now < vdev->next_stats_check_time) { + return; + } + + vdev->next_stats_check_time = now + vdev->stats_check_interval; + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + req_cnt = virtqueue->req_cnt + virtqueue->used_req_cnt; + if (req_cnt <= io_threshold) { + continue; + } + + irq_delay = (irq_delay_base * (req_cnt - io_threshold)) / io_threshold; + virtqueue->irq_delay_time = (uint32_t) spdk_max(0, irq_delay); + + virtqueue->req_cnt = 0; + virtqueue->next_event_time = now; + } +} + +void +spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_virtqueue *virtqueue; + uint64_t now; + uint16_t q_idx; + + if (vdev->coalescing_delay_time_base == 0) { + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + if (virtqueue->vring.desc == NULL || + (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + continue; + } + + spdk_vhost_vq_used_signal(vdev, virtqueue); + } + } else { + now = spdk_get_ticks(); + check_dev_io_stats(vdev, now); + + for (q_idx = 0; q_idx < vdev->max_queues; q_idx++) { + virtqueue = &vdev->virtqueue[q_idx]; + + /* No need for event right now */ + if (now < virtqueue->next_event_time || + (virtqueue->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + continue; + } + + if (!spdk_vhost_vq_used_signal(vdev, virtqueue)) { + continue; + } + + /* Syscall is quite long so update time */ + now = spdk_get_ticks(); + virtqueue->next_event_time = now + virtqueue->irq_delay_time; + } + } +} + +int +spdk_vhost_set_coalescing(struct spdk_vhost_dev *vdev, uint32_t delay_base_us, + uint32_t iops_threshold) +{ + uint64_t delay_time_base = delay_base_us * spdk_get_ticks_hz() / 1000000ULL; + uint32_t io_rate = iops_threshold * SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS / 1000U; + + if (delay_time_base >= UINT32_MAX) { + SPDK_ERRLOG("Delay time of %"PRIu32" is to big\n", delay_base_us); + return -EINVAL; + } else if (io_rate == 0) { + SPDK_ERRLOG("IOPS rate of %"PRIu32" is too low. Min is %u\n", io_rate, + 1000U / SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS); + return -EINVAL; + } + + vdev->coalescing_delay_time_base = delay_time_base; + vdev->coalescing_io_rate_threshold = io_rate; + + vdev->coalescing_delay_us = delay_base_us; + vdev->coalescing_iops_threshold = iops_threshold; + return 0; +} + +void +spdk_vhost_get_coalescing(struct spdk_vhost_dev *vdev, uint32_t *delay_base_us, + uint32_t *iops_threshold) +{ + if (delay_base_us) { + *delay_base_us = vdev->coalescing_delay_us; + } + + if (iops_threshold) { + *iops_threshold = vdev->coalescing_iops_threshold; + } +} + +/* + * Enqueue id and len to used ring. 
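+ * The element is written at last_used_idx masked by (size - 1), and
+ * used->idx is only advanced after a write barrier so the guest never
+ * observes a partially written entry.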
+ */ +void +spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *virtqueue, + uint16_t id, uint32_t len) +{ + struct rte_vhost_vring *vring = &virtqueue->vring; + struct vring_used *used = vring->used; + uint16_t last_idx = vring->last_used_idx & (vring->size - 1); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RING, + "Queue %td - USED RING: last_idx=%"PRIu16" req id=%"PRIu16" len=%"PRIu32"\n", + virtqueue - vdev->virtqueue, vring->last_used_idx, id, len); + + spdk_vhost_log_req_desc(vdev, virtqueue, id); + + vring->last_used_idx++; + used->ring[last_idx].id = id; + used->ring[last_idx].len = len; + + /* Ensure the used ring is updated before we log it or increment used->idx. */ + spdk_smp_wmb(); + + spdk_vhost_log_used_vring_elem(vdev, virtqueue, last_idx); + * (volatile uint16_t *) &used->idx = vring->last_used_idx; + spdk_vhost_log_used_vring_idx(vdev, virtqueue); + + /* Ensure all our used ring changes are visible to the guest at the time + * of interrupt. + * TODO: this is currently an sfence on x86. For other architectures we + * will most likely need an smp_mb(), but smp_mb() is an overkill for x86. + */ + spdk_wmb(); + + virtqueue->used_req_cnt++; +} + +int +spdk_vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size) +{ + struct vring_desc *old_desc = *desc; + uint16_t next_idx; + + if ((old_desc->flags & VRING_DESC_F_NEXT) == 0) { + *desc = NULL; + return 0; + } + + next_idx = old_desc->next; + if (spdk_unlikely(next_idx >= desc_table_size)) { + *desc = NULL; + return -1; + } + + *desc = &desc_table[next_idx]; + return 0; +} + +bool +spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc) +{ + return !!(cur_desc->flags & VRING_DESC_F_WRITE); +} + +#define _2MB_OFFSET(ptr) ((ptr) & (0x200000 - 1)) + +int +spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc) +{ + uint32_t remaining = desc->len; + uint32_t to_boundary; + uint32_t len; + uintptr_t payload = desc->addr; + uintptr_t vva; + + while (remaining) { + if (*iov_index >= SPDK_VHOST_IOVS_MAX) { + SPDK_ERRLOG("SPDK_VHOST_IOVS_MAX(%d) reached\n", SPDK_VHOST_IOVS_MAX); + return -1; + } + vva = (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload); + if (vva == 0) { + SPDK_ERRLOG("gpa_to_vva(%p) == NULL\n", (void *)payload); + return -1; + } + to_boundary = 0x200000 - _2MB_OFFSET(payload); + if (spdk_likely(remaining <= to_boundary)) { + len = remaining; + } else { + /* + * Descriptor crosses a 2MB hugepage boundary. vhost memory regions are allocated + * from hugepage memory, so this means this descriptor may be described by + * discontiguous vhost memory regions. Do not blindly split on the 2MB boundary, + * only split it if the two sides of the boundary do not map to the same vhost + * memory region. This helps ensure we do not exceed the max number of IOVs + * defined by SPDK_VHOST_IOVS_MAX. 
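+ * For example, a 12 KiB descriptor starting 4 KiB below a 2 MB boundary
+ * is first capped at the boundary, then grown back in up-to-2 MB steps
+ * while the translation stays virtually contiguous, so it still becomes
+ * a single iovec when both sides map to the same region.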
+ */ + len = to_boundary; + while (len < remaining) { + if (vva + len != (uintptr_t)rte_vhost_gpa_to_vva(vdev->mem, payload + len)) { + break; + } + len += spdk_min(remaining - len, 0x200000); + } + } + iov[*iov_index].iov_base = (void *)vva; + iov[*iov_index].iov_len = len; + remaining -= len; + payload += len; + (*iov_index)++; + } + + return 0; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_find_by_id(unsigned id) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->id == id) { + return vdev; + } + } + + return NULL; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_find_by_vid(int vid) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->vid == vid) { + return vdev; + } + } + + return NULL; +} + +#define SHIFT_2MB 21 +#define SIZE_2MB (1ULL << SHIFT_2MB) +#define FLOOR_2MB(x) (((uintptr_t)x) / SIZE_2MB) << SHIFT_2MB +#define CEIL_2MB(x) ((((uintptr_t)x) + SIZE_2MB - 1) / SIZE_2MB) << SHIFT_2MB + +static void +spdk_vhost_dev_mem_register(struct spdk_vhost_dev *vdev) +{ + struct rte_vhost_mem_region *region; + uint32_t i; + + for (i = 0; i < vdev->mem->nregions; i++) { + uint64_t start, end, len; + region = &vdev->mem->regions[i]; + start = FLOOR_2MB(region->mmap_addr); + end = CEIL_2MB(region->mmap_addr + region->mmap_size); + len = end - start; + SPDK_INFOLOG(SPDK_LOG_VHOST, "Registering VM memory for vtophys translation - 0x%jx len:0x%jx\n", + start, len); + + if (spdk_mem_register((void *)start, len) != 0) { + SPDK_WARNLOG("Failed to register memory region %"PRIu32". Future vtophys translation might fail.\n", + i); + continue; + } + } +} + +static void +spdk_vhost_dev_mem_unregister(struct spdk_vhost_dev *vdev) +{ + struct rte_vhost_mem_region *region; + uint32_t i; + + for (i = 0; i < vdev->mem->nregions; i++) { + uint64_t start, end, len; + region = &vdev->mem->regions[i]; + start = FLOOR_2MB(region->mmap_addr); + end = CEIL_2MB(region->mmap_addr + region->mmap_size); + len = end - start; + + if (spdk_vtophys((void *) start) == SPDK_VTOPHYS_ERROR) { + continue; /* region has not been registered */ + } + + if (spdk_mem_unregister((void *)start, len) != 0) { + assert(false); + } + } + +} + +static void +spdk_vhost_free_reactor(uint32_t lcore) +{ + g_num_ctrlrs[lcore]--; +} + +struct spdk_vhost_dev * +spdk_vhost_dev_find(const char *ctrlr_name) +{ + struct spdk_vhost_dev *vdev; + size_t dev_dirname_len = strlen(dev_dirname); + + if (strncmp(ctrlr_name, dev_dirname, dev_dirname_len) == 0) { + ctrlr_name += dev_dirname_len; + } + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (strcmp(vdev->name, ctrlr_name) == 0) { + return vdev; + } + } + + return NULL; +} + +static int +spdk_vhost_parse_core_mask(const char *mask, struct spdk_cpuset *cpumask) +{ + int rc; + + if (cpumask == NULL) { + return -1; + } + + if (mask == NULL) { + spdk_cpuset_copy(cpumask, spdk_app_get_core_mask()); + return 0; + } + + rc = spdk_app_parse_core_mask(mask, cpumask); + if (rc < 0) { + SPDK_ERRLOG("invalid cpumask %s\n", mask); + return -1; + } + + if (spdk_cpuset_count(cpumask) == 0) { + SPDK_ERRLOG("no cpu is selected among reactor mask(=%s)\n", + spdk_cpuset_fmt(spdk_app_get_core_mask())); + return -1; + } + + return 0; +} + +static void * +_start_rte_driver(void *arg) +{ + char *path = arg; + + if (rte_vhost_driver_start(path) != 0) { + return NULL; + } + + return path; +} + +int +spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct 
spdk_vhost_dev_backend *backend) +{ + static unsigned ctrlr_num; + char path[PATH_MAX]; + struct stat file_stat; + struct spdk_cpuset *cpumask; + int rc; + + assert(vdev); + + /* We expect devices inside g_spdk_vhost_devices to be sorted in ascending + * order in regard of vdev->id. For now we always set vdev->id = ctrlr_num++ + * and append each vdev to the very end of g_spdk_vhost_devices list. + * This is required for foreach vhost events to work. + */ + if (ctrlr_num == UINT_MAX) { + assert(false); + return -EINVAL; + } + + if (name == NULL) { + SPDK_ERRLOG("Can't register controller with no name\n"); + return -EINVAL; + } + + cpumask = spdk_cpuset_alloc(); + if (!cpumask) { + SPDK_ERRLOG("spdk_cpuset_alloc failed\n"); + return -ENOMEM; + } + + if (spdk_vhost_parse_core_mask(mask_str, cpumask) != 0) { + SPDK_ERRLOG("cpumask %s is invalid (app mask is 0x%s)\n", + mask_str, spdk_cpuset_fmt(spdk_app_get_core_mask())); + rc = -EINVAL; + goto out; + } + + if (spdk_vhost_dev_find(name)) { + SPDK_ERRLOG("vhost controller %s already exists.\n", name); + rc = -EEXIST; + goto out; + } + + if (snprintf(path, sizeof(path), "%s%s", dev_dirname, name) >= (int)sizeof(path)) { + SPDK_ERRLOG("Resulting socket path for controller %s is too long: %s%s\n", name, dev_dirname, + name); + rc = -EINVAL; + goto out; + } + + /* Register vhost driver to handle vhost messages. */ + if (stat(path, &file_stat) != -1) { + if (!S_ISSOCK(file_stat.st_mode)) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The file already exists and is not a socket.\n", + path); + rc = -EIO; + goto out; + } else if (unlink(path) != 0) { + SPDK_ERRLOG("Cannot create a domain socket at path \"%s\": " + "The socket already exists and failed to unlink.\n", + path); + rc = -EIO; + goto out; + } + } + + if (rte_vhost_driver_register(path, 0) != 0) { + SPDK_ERRLOG("Could not register controller %s with vhost library\n", name); + SPDK_ERRLOG("Check if domain socket %s already exists\n", path); + rc = -EIO; + goto out; + } + if (rte_vhost_driver_set_features(path, backend->virtio_features) || + rte_vhost_driver_disable_features(path, backend->disabled_features)) { + SPDK_ERRLOG("Couldn't set vhost features for controller %s\n", name); + + rte_vhost_driver_unregister(path); + rc = -EIO; + goto out; + } + + if (rte_vhost_driver_callback_register(path, &g_spdk_vhost_ops) != 0) { + rte_vhost_driver_unregister(path); + SPDK_ERRLOG("Couldn't register callbacks for controller %s\n", name); + rc = -EIO; + goto out; + } + + /* The following might start a POSIX thread that polls for incoming + * socket connections and calls backend->start/stop_device. These backend + * callbacks are also protected by the global SPDK vhost mutex, so we're + * safe with not initializing the vdev just yet. 
+ */ + if (spdk_call_unaffinitized(_start_rte_driver, path) == NULL) { + SPDK_ERRLOG("Failed to start vhost driver for controller %s (%d): %s\n", + name, errno, spdk_strerror(errno)); + rte_vhost_driver_unregister(path); + rc = -EIO; + goto out; + } + + vdev->name = strdup(name); + vdev->path = strdup(path); + vdev->id = ctrlr_num++; + vdev->vid = -1; + vdev->lcore = -1; + vdev->cpumask = cpumask; + vdev->registered = true; + vdev->backend = backend; + + spdk_vhost_set_coalescing(vdev, SPDK_VHOST_COALESCING_DELAY_BASE_US, + SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD); + vdev->next_stats_check_time = 0; + vdev->stats_check_interval = SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS * spdk_get_ticks_hz() / + 1000UL; + + TAILQ_INSERT_TAIL(&g_spdk_vhost_devices, vdev, tailq); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: new controller added\n", vdev->name); + return 0; + +out: + spdk_cpuset_free(cpumask); + return rc; +} + +int +spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev) +{ + if (vdev->vid != -1) { + SPDK_ERRLOG("Controller %s has still valid connection.\n", vdev->name); + return -EBUSY; + } + + if (vdev->registered && rte_vhost_driver_unregister(vdev->path) != 0) { + SPDK_ERRLOG("Could not unregister controller %s with vhost library\n" + "Check if domain socket %s still exists\n", + vdev->name, vdev->path); + return -EIO; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: removed\n", vdev->name); + + free(vdev->name); + free(vdev->path); + spdk_cpuset_free(vdev->cpumask); + TAILQ_REMOVE(&g_spdk_vhost_devices, vdev, tailq); + return 0; +} + +static struct spdk_vhost_dev * +spdk_vhost_dev_next(unsigned i) +{ + struct spdk_vhost_dev *vdev; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + if (vdev->id > i) { + return vdev; + } + } + + return NULL; +} + +const char * +spdk_vhost_dev_get_name(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->name; +} + +const struct spdk_cpuset * +spdk_vhost_dev_get_cpumask(struct spdk_vhost_dev *vdev) +{ + assert(vdev != NULL); + return vdev->cpumask; +} + +static uint32_t +spdk_vhost_allocate_reactor(struct spdk_cpuset *cpumask) +{ + uint32_t i, selected_core; + uint32_t min_ctrlrs; + + min_ctrlrs = INT_MAX; + selected_core = spdk_env_get_first_core(); + + SPDK_ENV_FOREACH_CORE(i) { + if (!spdk_cpuset_get_cpu(cpumask, i)) { + continue; + } + + if (g_num_ctrlrs[i] < min_ctrlrs) { + selected_core = i; + min_ctrlrs = g_num_ctrlrs[i]; + } + } + + g_num_ctrlrs[selected_core]++; + return selected_core; +} + +void +spdk_vhost_dev_backend_event_done(void *event_ctx, int response) +{ + struct spdk_vhost_dev_event_ctx *ctx = event_ctx; + + ctx->response = response; + sem_post(&ctx->sem); +} + +static void +spdk_vhost_event_cb(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + + ctx->cb_fn(ctx->vdev, ctx); +} + +static void +spdk_vhost_event_async_fn(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + struct spdk_vhost_dev *vdev; + struct spdk_event *ev; + + if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) { + ev = spdk_event_allocate(spdk_env_get_current_core(), spdk_vhost_event_async_fn, arg1, arg2); + spdk_event_call(ev); + return; + } + + vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id); + if (vdev != ctx->vdev) { + /* vdev has been changed after enqueuing this event */ + vdev = NULL; + } + + if (vdev != NULL && vdev->lcore >= 0 && + (uint32_t)vdev->lcore != spdk_env_get_current_core()) { + /* if vdev has been relocated to other core, it is no longer thread-safe + * to access its 
contents here. Even though we're running under global vhost + * mutex, the controller itself (and its pollers) are not. We need to chase + * the vdev thread as many times as necessary. + */ + ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_async_fn, arg1, arg2); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + ctx->cb_fn(vdev, arg2); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + free(ctx); +} + +static void spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev, + spdk_vhost_event_fn fn, void *arg); + +static void +spdk_vhost_event_async_foreach_fn(void *arg1, void *arg2) +{ + struct spdk_vhost_dev_event_ctx *ctx = arg1; + struct spdk_vhost_dev *vdev; + struct spdk_event *ev; + + if (pthread_mutex_trylock(&g_spdk_vhost_mutex) != 0) { + ev = spdk_event_allocate(spdk_env_get_current_core(), + spdk_vhost_event_async_foreach_fn, arg1, arg2); + spdk_event_call(ev); + return; + } + + vdev = spdk_vhost_dev_find_by_id(ctx->vdev_id); + if (vdev != ctx->vdev) { + /* ctx->vdev is probably a dangling pointer at this point. + * It must have been removed in the meantime, so we just skip + * it in our foreach chain. */ + goto out_unlock_continue; + } + + /* the assert is just for static analyzers, vdev cannot be NULL here */ + assert(vdev != NULL); + if (vdev->lcore >= 0 && + (uint32_t)vdev->lcore != spdk_env_get_current_core()) { + /* if vdev has been relocated to other core, it is no longer thread-safe + * to access its contents here. Even though we're running under global vhost + * mutex, the controller itself (and its pollers) are not. We need to chase + * the vdev thread as many times as necessary. + */ + ev = spdk_event_allocate(vdev->lcore, + spdk_vhost_event_async_foreach_fn, arg1, arg2); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + ctx->cb_fn(vdev, arg2); + +out_unlock_continue: + vdev = spdk_vhost_dev_next(ctx->vdev_id); + spdk_vhost_external_event_foreach_continue(vdev, ctx->cb_fn, arg2); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + free(ctx); +} + +static int +_spdk_vhost_event_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, + unsigned timeout_sec, const char *errmsg) +{ + struct spdk_vhost_dev_event_ctx ev_ctx = {0}; + struct spdk_event *ev; + struct timespec timeout; + int rc; + + rc = sem_init(&ev_ctx.sem, 0, 0); + if (rc != 0) { + SPDK_ERRLOG("Failed to initialize semaphore for vhost timed event\n"); + return -errno; + } + + ev_ctx.vdev = vdev; + ev_ctx.cb_fn = cb_fn; + ev = spdk_event_allocate(vdev->lcore, spdk_vhost_event_cb, &ev_ctx, NULL); + assert(ev); + spdk_event_call(ev); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + + clock_gettime(CLOCK_REALTIME, &timeout); + timeout.tv_sec += timeout_sec; + + rc = sem_timedwait(&ev_ctx.sem, &timeout); + if (rc != 0) { + SPDK_ERRLOG("Timeout waiting for event: %s.\n", errmsg); + sem_wait(&ev_ctx.sem); + } + + sem_destroy(&ev_ctx.sem); + pthread_mutex_lock(&g_spdk_vhost_mutex); + return ev_ctx.response; +} + +static int +spdk_vhost_event_async_send(struct spdk_vhost_dev *vdev, spdk_vhost_event_fn cb_fn, void *arg, + bool foreach) +{ + struct spdk_vhost_dev_event_ctx *ev_ctx; + struct spdk_event *ev; + spdk_event_fn fn; + + ev_ctx = calloc(1, sizeof(*ev_ctx)); + if (ev_ctx == NULL) { + SPDK_ERRLOG("Failed to alloc vhost event.\n"); + assert(false); + return -ENOMEM; + } + + ev_ctx->vdev = vdev; + ev_ctx->vdev_id = vdev->id; + ev_ctx->cb_fn = cb_fn; + + fn = foreach ? 
spdk_vhost_event_async_foreach_fn : spdk_vhost_event_async_fn; + ev = spdk_event_allocate(ev_ctx->vdev->lcore, fn, ev_ctx, arg); + assert(ev); + spdk_event_call(ev); + + return 0; +} + +static void +stop_device(int vid) +{ + struct spdk_vhost_dev *vdev; + struct rte_vhost_vring *q; + int rc; + uint16_t i; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to stop.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + if (vdev->lcore == -1) { + SPDK_ERRLOG("Controller %s is not loaded.\n", vdev->name); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + rc = _spdk_vhost_event_send(vdev, vdev->backend->stop_device, 3, "stop device"); + if (rc != 0) { + SPDK_ERRLOG("Couldn't stop device with vid %d.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + for (i = 0; i < vdev->max_queues; i++) { + q = &vdev->virtqueue[i].vring; + if (q->desc == NULL) { + continue; + } + rte_vhost_set_vhost_vring_last_idx(vdev->vid, i, q->last_avail_idx, q->last_used_idx); + } + + spdk_vhost_dev_mem_unregister(vdev); + free(vdev->mem); + spdk_vhost_free_reactor(vdev->lcore); + vdev->lcore = -1; + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +static int +start_device(int vid) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + uint16_t i; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->lcore != -1) { + SPDK_ERRLOG("Controller %s already loaded.\n", vdev->name); + goto out; + } + + vdev->max_queues = 0; + memset(vdev->virtqueue, 0, sizeof(vdev->virtqueue)); + for (i = 0; i < SPDK_VHOST_MAX_VQUEUES; i++) { + if (rte_vhost_get_vhost_vring(vid, i, &vdev->virtqueue[i].vring)) { + continue; + } + + if (vdev->virtqueue[i].vring.desc == NULL || + vdev->virtqueue[i].vring.size == 0) { + continue; + } + + /* Disable notifications. */ + if (rte_vhost_enable_guest_notification(vid, i, 0) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to disable guest notification on queue %"PRIu16"\n", vid, i); + goto out; + } + + vdev->max_queues = i + 1; + } + + if (rte_vhost_get_negotiated_features(vid, &vdev->negotiated_features) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get negotiated driver features\n", vid); + goto out; + } + + if (rte_vhost_get_mem_table(vid, &vdev->mem) != 0) { + SPDK_ERRLOG("vhost device %d: Failed to get guest memory table\n", vid); + goto out; + } + + /* + * Not sure right now but this look like some kind of QEMU bug and guest IO + * might be frozed without kicking all queues after live-migration. This look like + * the previous vhost instance failed to effectively deliver all interrupts before + * the GET_VRING_BASE message. This shouldn't harm guest since spurious interrupts + * should be ignored by guest virtio driver. + * + * Tested on QEMU 2.10.91 and 2.11.50. 
+ */ + for (i = 0; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.callfd != -1) { + eventfd_write(vdev->virtqueue[i].vring.callfd, (eventfd_t)1); + } + } + + vdev->lcore = spdk_vhost_allocate_reactor(vdev->cpumask); + spdk_vhost_dev_mem_register(vdev); + rc = _spdk_vhost_event_send(vdev, vdev->backend->start_device, 3, "start device"); + if (rc != 0) { + spdk_vhost_dev_mem_unregister(vdev); + free(vdev->mem); + spdk_vhost_free_reactor(vdev->lcore); + vdev->lcore = -1; + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +static int +get_config(int vid, uint8_t *config, uint32_t len) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->backend->vhost_get_config) { + rc = vdev->backend->vhost_get_config(vdev, config, len); + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +static int +set_config(int vid, uint8_t *config, uint32_t offset, uint32_t size, uint32_t flags) +{ + struct spdk_vhost_dev *vdev; + int rc = -1; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Controller with vid %d doesn't exist.\n", vid); + goto out; + } + + if (vdev->backend->vhost_set_config) { + rc = vdev->backend->vhost_set_config(vdev, config, offset, size, flags); + } + +out: + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return rc; +} + +int +spdk_vhost_set_socket_path(const char *basename) +{ + int ret; + + if (basename && strlen(basename) > 0) { + ret = snprintf(dev_dirname, sizeof(dev_dirname) - 2, "%s", basename); + if (ret <= 0) { + return -EINVAL; + } + if ((size_t)ret >= sizeof(dev_dirname) - 2) { + SPDK_ERRLOG("Char dev dir path length %d is too long\n", ret); + return -EINVAL; + } + + if (dev_dirname[ret - 1] != '/') { + dev_dirname[ret] = '/'; + dev_dirname[ret + 1] = '\0'; + } + } + + return 0; +} + +static void * +session_shutdown(void *arg) +{ + struct spdk_vhost_dev *vdev = NULL; + + TAILQ_FOREACH(vdev, &g_spdk_vhost_devices, tailq) { + rte_vhost_driver_unregister(vdev->path); + vdev->registered = false; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Exiting\n"); + spdk_event_call((struct spdk_event *)arg); + return NULL; +} + +void +spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + assert(vdev->backend->dump_info_json != NULL); + vdev->backend->dump_info_json(vdev, w); +} + +int +spdk_vhost_dev_remove(struct spdk_vhost_dev *vdev) +{ + return vdev->backend->remove_device(vdev); +} + +static int +new_connection(int vid) +{ + struct spdk_vhost_dev *vdev; + char ifname[PATH_MAX]; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + if (rte_vhost_get_ifname(vid, ifname, PATH_MAX) < 0) { + SPDK_ERRLOG("Couldn't get a valid ifname for device with vid %d\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + vdev = spdk_vhost_dev_find(ifname); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to create connection for.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + /* since pollers are not running it safe not to use spdk_event here */ + if (vdev->vid != -1) { + SPDK_ERRLOG("Device with vid %d is already connected.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return -1; + } + + vdev->vid = vid; + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return 0; +} + +static void 
+destroy_connection(int vid) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find_by_vid(vid); + if (vdev == NULL) { + SPDK_ERRLOG("Couldn't find device with vid %d to destroy connection for.\n", vid); + pthread_mutex_unlock(&g_spdk_vhost_mutex); + return; + } + + /* since pollers are not running it safe not to use spdk_event here */ + vdev->vid = -1; + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_call_external_event(const char *ctrlr_name, spdk_vhost_event_fn fn, void *arg) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = spdk_vhost_dev_find(ctrlr_name); + + if (vdev == NULL) { + pthread_mutex_unlock(&g_spdk_vhost_mutex); + fn(NULL, arg); + return; + } + + if (vdev->lcore == -1) { + fn(vdev, arg); + } else { + spdk_vhost_event_async_send(vdev, fn, arg, false); + } + + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +static void +spdk_vhost_external_event_foreach_continue(struct spdk_vhost_dev *vdev, + spdk_vhost_event_fn fn, void *arg) +{ + if (vdev == NULL) { + fn(NULL, arg); + return; + } + + while (vdev->lcore == -1) { + fn(vdev, arg); + vdev = spdk_vhost_dev_next(vdev->id); + if (vdev == NULL) { + fn(NULL, arg); + return; + } + } + + spdk_vhost_event_async_send(vdev, fn, arg, true); +} + +void +spdk_vhost_call_external_event_foreach(spdk_vhost_event_fn fn, void *arg) +{ + struct spdk_vhost_dev *vdev; + + pthread_mutex_lock(&g_spdk_vhost_mutex); + vdev = TAILQ_FIRST(&g_spdk_vhost_devices); + spdk_vhost_external_event_foreach_continue(vdev, fn, arg); + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_lock(void) +{ + pthread_mutex_lock(&g_spdk_vhost_mutex); +} + +void +spdk_vhost_unlock(void) +{ + pthread_mutex_unlock(&g_spdk_vhost_mutex); +} + +int +spdk_vhost_init(void) +{ + uint32_t last_core; + size_t len; + int ret; + + if (dev_dirname[0] == '\0') { + if (getcwd(dev_dirname, sizeof(dev_dirname) - 1) == NULL) { + SPDK_ERRLOG("getcwd failed (%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + + len = strlen(dev_dirname); + if (dev_dirname[len - 1] != '/') { + dev_dirname[len] = '/'; + dev_dirname[len + 1] = '\0'; + } + } + + last_core = spdk_env_get_last_core(); + g_num_ctrlrs = calloc(last_core + 1, sizeof(uint32_t)); + if (!g_num_ctrlrs) { + SPDK_ERRLOG("Could not allocate array size=%u for g_num_ctrlrs\n", + last_core + 1); + return -1; + } + + ret = spdk_vhost_scsi_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost controllers\n"); + return -1; + } + + ret = spdk_vhost_blk_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost block controllers\n"); + return -1; + } + + ret = spdk_vhost_nvme_controller_construct(); + if (ret != 0) { + SPDK_ERRLOG("Cannot construct vhost NVMe controllers\n"); + return -1; + } + + return 0; +} + +static int +_spdk_vhost_fini_remove_vdev_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + spdk_vhost_fini_cb fini_cb = arg; + + if (vdev != NULL) { + spdk_vhost_dev_remove(vdev); + return 0; + } + + /* All devices are removed now. 
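+	 * spdk_vhost_call_external_event_foreach() calls this callback once per
+	 * registered device and then one final time with vdev == NULL; that final
+	 * call is the point at which global state can be torn down and the
+	 * caller-supplied fini_cb invoked.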
*/ + free(g_num_ctrlrs); + fini_cb(); + return 0; +} + +static void +_spdk_vhost_fini(void *arg1, void *arg2) +{ + spdk_vhost_fini_cb fini_cb = arg1; + + spdk_vhost_call_external_event_foreach(_spdk_vhost_fini_remove_vdev_cb, fini_cb); +} + +void +spdk_vhost_fini(spdk_vhost_fini_cb fini_cb) +{ + pthread_t tid; + int rc; + struct spdk_event *fini_ev; + + fini_ev = spdk_event_allocate(spdk_env_get_current_core(), _spdk_vhost_fini, fini_cb, NULL); + + /* rte_vhost API for removing sockets is not asynchronous. Since it may call SPDK + * ops for stopping a device or removing a connection, we need to call it from + * a separate thread to avoid deadlock. + */ + rc = pthread_create(&tid, NULL, &session_shutdown, fini_ev); + if (rc < 0) { + SPDK_ERRLOG("Failed to start session shutdown thread (%d): %s\n", rc, spdk_strerror(rc)); + abort(); + } + pthread_detach(tid); +} + +struct spdk_vhost_write_config_json_ctx { + struct spdk_json_write_ctx *w; + struct spdk_event *done_ev; +}; + +static int +spdk_vhost_config_json_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_vhost_write_config_json_ctx *ctx = arg; + uint32_t delay_base_us; + uint32_t iops_threshold; + + if (vdev == NULL) { + spdk_json_write_array_end(ctx->w); + spdk_event_call(ctx->done_ev); + free(ctx); + return 0; + } + + vdev->backend->write_config_json(vdev, ctx->w); + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + if (delay_base_us) { + spdk_json_write_object_begin(ctx->w); + spdk_json_write_named_string(ctx->w, "method", "set_vhost_controller_coalescing"); + + spdk_json_write_named_object_begin(ctx->w, "params"); + spdk_json_write_named_string(ctx->w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(ctx->w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(ctx->w, "iops_threshold", iops_threshold); + spdk_json_write_object_end(ctx->w); + + spdk_json_write_object_end(ctx->w); + } + + return 0; +} + +void +spdk_vhost_config_json(struct spdk_json_write_ctx *w, struct spdk_event *done_ev) +{ + struct spdk_vhost_write_config_json_ctx *ctx; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + spdk_event_call(done_ev); + return; + } + + ctx->w = w; + ctx->done_ev = done_ev; + + spdk_json_write_array_begin(w); + + spdk_vhost_call_external_event_foreach(spdk_vhost_config_json_cb, ctx); +} + +SPDK_LOG_REGISTER_COMPONENT("vhost", SPDK_LOG_VHOST) +SPDK_LOG_REGISTER_COMPONENT("vhost_ring", SPDK_LOG_VHOST_RING) diff --git a/src/spdk/lib/vhost/vhost_blk.c b/src/spdk/lib/vhost/vhost_blk.c new file mode 100644 index 00000000..6a9a1896 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_blk.c @@ -0,0 +1,901 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/vhost.h" + +#include "vhost_internal.h" + +struct spdk_vhost_blk_task { + struct spdk_bdev_io *bdev_io; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_vhost_virtqueue *vq; + + volatile uint8_t *status; + + uint16_t req_idx; + + /* for io wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + + /* If set, the task is currently used for I/O processing. */ + bool used; + + /** Number of bytes that were written. */ + uint32_t used_len; + uint16_t iovcnt; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; +}; + +struct spdk_vhost_blk_dev { + struct spdk_vhost_dev vdev; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_poller *requestq_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; + bool readonly; +}; + +/* forward declaration */ +static const struct spdk_vhost_dev_backend vhost_blk_device_backend; + +static int +process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev, + struct spdk_vhost_virtqueue *vq); + +static void +blk_task_finish(struct spdk_vhost_blk_task *task) +{ + assert(task->bvdev->vdev.task_cnt > 0); + task->bvdev->vdev.task_cnt--; + task->used = false; +} + +static void +invalid_blk_request(struct spdk_vhost_blk_task *task, uint8_t status) +{ + if (task->status) { + *task->status = status; + } + + spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx, + task->used_len); + blk_task_finish(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Invalid request (status=%" PRIu8")\n", status); +} + +/* + * Process task's descriptor chain and setup data related fields. + * Return + * total size of suplied buffers + * + * FIXME: Make this function return to rd_cnt and wr_cnt + */ +static int +blk_iovs_setup(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, uint16_t req_idx, + struct iovec *iovs, uint16_t *iovs_cnt, uint32_t *length) +{ + struct vring_desc *desc, *desc_table; + uint16_t out_cnt = 0, cnt = 0; + uint32_t desc_table_size, len = 0; + int rc; + + rc = spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table, &desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: Invalid descriptor at index %"PRIu16".\n", vdev->name, req_idx); + return -1; + } + + while (1) { + /* + * Maximum cnt reached? + * Should not happen if request is well formatted, otherwise this is a BUG. 
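+ * (*iovs_cnt is in/out: on entry it is the capacity of the caller-supplied
+ * iovec array, on successful return it is the number of entries actually
+ * filled in, so hitting cnt == *iovs_cnt here means that array is exhausted.)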
+ */ + if (spdk_unlikely(cnt == *iovs_cnt)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Max IOVs in request reached (req_idx = %"PRIu16").\n", + req_idx); + return -1; + } + + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &cnt, desc))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid descriptor %" PRIu16" (req_idx = %"PRIu16").\n", + req_idx, cnt); + return -1; + } + + len += desc->len; + + out_cnt += spdk_vhost_vring_desc_is_wr(desc); + + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (rc != 0) { + SPDK_ERRLOG("%s: Descriptor chain at index %"PRIu16" terminated unexpectedly.\n", + vdev->name, req_idx); + return -1; + } else if (desc == NULL) { + break; + } + } + + /* + * There must be least two descriptors. + * First contain request so it must be readable. + * Last descriptor contain buffer for response so it must be writable. + */ + if (spdk_unlikely(out_cnt == 0 || cnt < 2)) { + return -1; + } + + *length = len; + *iovs_cnt = cnt; + return 0; +} + +static void +blk_request_finish(bool success, struct spdk_vhost_blk_task *task) +{ + *task->status = success ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR; + spdk_vhost_vq_used_ring_enqueue(&task->bvdev->vdev, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Finished task (%p) req_idx=%d\n status: %s\n", task, + task->req_idx, success ? "OK" : "FAIL"); + blk_task_finish(task); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_blk_task *task = cb_arg; + + spdk_bdev_free_io(bdev_io); + blk_request_finish(success, task); +} + +static void +blk_request_resubmit(void *arg) +{ + struct spdk_vhost_blk_task *task = (struct spdk_vhost_blk_task *)arg; + int rc = 0; + + rc = process_blk_request(task, task->bvdev, task->vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p resubmitted ======\n", task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p failed ======\n", task); + } +} + +static inline void +blk_request_queue_io(struct spdk_vhost_blk_task *task) +{ + int rc; + struct spdk_vhost_blk_dev *bvdev = task->bvdev; + struct spdk_bdev *bdev = bvdev->bdev; + + task->bdev_io_wait.bdev = bdev; + task->bdev_io_wait.cb_fn = blk_request_resubmit; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(bdev, bvdev->bdev_io_channel, &task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_blk, rc=%d\n", rc); + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + } +} + +static int +process_blk_request(struct spdk_vhost_blk_task *task, struct spdk_vhost_blk_dev *bvdev, + struct spdk_vhost_virtqueue *vq) +{ + const struct virtio_blk_outhdr *req; + struct iovec *iov; + uint32_t type; + uint32_t payload_len; + int rc; + + if (blk_iovs_setup(&bvdev->vdev, vq, task->req_idx, task->iovs, &task->iovcnt, &payload_len)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Invalid request (req_idx = %"PRIu16").\n", task->req_idx); + /* Only READ and WRITE are supported for now. 
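+ * A well-formed virtio-blk request is a descriptor chain of at least two
+ * elements:
+ *   iovs[0]            - 16-byte struct virtio_blk_outhdr (type, ioprio, sector)
+ *   iovs[1..iovcnt-2]  - optional data payload
+ *   iovs[iovcnt-1]     - 1-byte status the device writes back.
+ * The checks below verify this layout and then strip the header and status
+ * from payload_len and drop the two bookkeeping entries from iovcnt.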
*/ + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + iov = &task->iovs[0]; + if (spdk_unlikely(iov->iov_len != sizeof(*req))) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "First descriptor size is %zu but expected %zu (req_idx = %"PRIu16").\n", + iov->iov_len, sizeof(*req), task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + req = iov->iov_base; + + iov = &task->iovs[task->iovcnt - 1]; + if (spdk_unlikely(iov->iov_len != 1)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, + "Last descriptor size is %zu but expected %d (req_idx = %"PRIu16").\n", + iov->iov_len, 1, task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + task->status = iov->iov_base; + payload_len -= sizeof(*req) + sizeof(*task->status); + task->iovcnt -= 2; + + type = req->type; +#ifdef VIRTIO_BLK_T_BARRIER + /* Don't care about barier for now (as QEMU's virtio-blk do). */ + type &= ~VIRTIO_BLK_T_BARRIER; +#endif + + switch (type) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: + if (spdk_unlikely((payload_len & (512 - 1)) != 0)) { + SPDK_ERRLOG("%s - passed IO buffer is not multiple of 512b (req_idx = %"PRIu16").\n", + type ? "WRITE" : "READ", task->req_idx); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + if (type == VIRTIO_BLK_T_IN) { + task->used_len = payload_len + sizeof(*task->status); + rc = spdk_bdev_readv(bvdev->bdev_desc, bvdev->bdev_io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else if (!bvdev->readonly) { + task->used_len = sizeof(*task->status); + rc = spdk_bdev_writev(bvdev->bdev_desc, bvdev->bdev_io_channel, + &task->iovs[1], task->iovcnt, req->sector * 512, + payload_len, blk_request_complete_cb, task); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Device is in read-only mode!\n"); + rc = -1; + } + + if (rc) { + if (rc == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "No memory, start to queue io.\n"); + blk_request_queue_io(task); + } else { + invalid_blk_request(task, VIRTIO_BLK_S_IOERR); + return -1; + } + } + break; + case VIRTIO_BLK_T_GET_ID: + if (!task->iovcnt || !payload_len) { + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + task->used_len = spdk_min((size_t)VIRTIO_BLK_ID_BYTES, task->iovs[1].iov_len); + spdk_strcpy_pad(task->iovs[1].iov_base, spdk_bdev_get_product_name(bvdev->bdev), + task->used_len, ' '); + blk_request_finish(true, task); + break; + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "Not supported request type '%"PRIu32"'.\n", type); + invalid_blk_request(task, VIRTIO_BLK_S_UNSUPP); + return -1; + } + + return 0; +} + +static void +process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_blk_task *task; + int rc; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + if (!reqs_cnt) { + return; + } + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + bvdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_blk_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + bvdev->vdev.name, 
reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, reqs[i], 0); + continue; + } + + bvdev->vdev.task_cnt++; + + task->used = true; + task->iovcnt = SPDK_COUNTOF(task->iovs); + task->status = NULL; + task->used_len = 0; + + rc = process_blk_request(task, bvdev, vq); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d submitted ======\n", task, + reqs[i]); + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK, "====== Task %p req_idx %d failed ======\n", task, reqs[i]); + } + } +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + uint16_t q_idx; + + for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) { + process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&bvdev->vdev); + + return -1; +} + +static void +no_bdev_process_vq(struct spdk_vhost_blk_dev *bvdev, struct spdk_vhost_virtqueue *vq) +{ + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + uint32_t length; + uint16_t iovcnt, req_idx; + + if (spdk_vhost_vq_avail_ring_get(vq, &req_idx, 1) != 1) { + return; + } + + iovcnt = SPDK_COUNTOF(iovs); + if (blk_iovs_setup(&bvdev->vdev, vq, req_idx, iovs, &iovcnt, &length) == 0) { + *(volatile uint8_t *)iovs[iovcnt - 1].iov_base = VIRTIO_BLK_S_IOERR; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_BLK_DATA, "Aborting request %" PRIu16"\n", req_idx); + } + + spdk_vhost_vq_used_ring_enqueue(&bvdev->vdev, vq, req_idx, 0); +} + +static int +no_bdev_vdev_worker(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + uint16_t q_idx; + + for (q_idx = 0; q_idx < bvdev->vdev.max_queues; q_idx++) { + no_bdev_process_vq(bvdev, &bvdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&bvdev->vdev); + + if (bvdev->vdev.task_cnt == 0 && bvdev->bdev_io_channel) { + spdk_put_io_channel(bvdev->bdev_io_channel); + bvdev->bdev_io_channel = NULL; + } + + return -1; +} + +static struct spdk_vhost_blk_dev * +to_blk_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev == NULL) { + return NULL; + } + + if (vdev->backend != &vhost_blk_device_backend) { + SPDK_ERRLOG("%s: not a vhost-blk device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_blk_dev, vdev); +} + +struct spdk_bdev * +spdk_vhost_blk_get_dev(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + + assert(bvdev != NULL); + return bvdev->bdev; +} + +static int +_bdev_remove_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + + SPDK_WARNLOG("Controller %s: Hot-removing bdev - all further requests will fail.\n", + bvdev->vdev.name); + if (bvdev->requestq_poller) { + spdk_poller_unregister(&bvdev->requestq_poller); + bvdev->requestq_poller = spdk_poller_register(no_bdev_vdev_worker, bvdev, 0); + } + + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + bvdev->bdev = NULL; + return 0; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_blk_dev *bvdev = remove_ctx; + + spdk_vhost_call_external_event(bvdev->vdev.name, _bdev_remove_cb, bvdev); +} + +static void +free_task_pool(struct spdk_vhost_blk_dev *bvdev) +{ + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < bvdev->vdev.max_queues; i++) { + vq = &bvdev->vdev.virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_dma_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_blk_dev *bvdev) +{ + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_blk_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i 
< bvdev->vdev.max_queues; i++) { + vq = &bvdev->vdev.virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + bvdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(bvdev); + return -1; + } + vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_blk_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL); + if (vq->tasks == NULL) { + SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + bvdev->vdev.name, task_cnt, i); + free_task_pool(bvdev); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_blk_task *)vq->tasks)[j]; + task->bvdev = bvdev; + task->req_idx = j; + task->vq = vq; + } + } + + return 0; +} + +/* + * A new device is added to a data core. First the device is added to the main linked list + * and then allocated to a specific data core. + * + */ +static int +spdk_vhost_blk_start(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_blk_dev *bvdev; + int i, rc = 0; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to start non-blk controller as a blk one.\n"); + rc = -1; + goto out; + } + + /* validate all I/O queues are in a contiguous index range */ + for (i = 0; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(bvdev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", bvdev->vdev.name); + goto out; + } + + if (bvdev->bdev) { + bvdev->bdev_io_channel = spdk_bdev_get_io_channel(bvdev->bdev_desc); + if (!bvdev->bdev_io_channel) { + free_task_pool(bvdev); + SPDK_ERRLOG("Controller %s: IO channel allocation failed\n", vdev->name); + rc = -1; + goto out; + } + } + + bvdev->requestq_poller = spdk_poller_register(bvdev->bdev ? 
vdev_worker : no_bdev_vdev_worker, + bvdev, 0); + SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n", + vdev->name, vdev->lcore); +out: + spdk_vhost_dev_backend_event_done(event_ctx, rc); + return rc; +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_blk_dev *bvdev = arg; + int i; + + if (bvdev->vdev.task_cnt > 0) { + return -1; + } + + for (i = 0; i < bvdev->vdev.max_queues; i++) { + bvdev->vdev.virtqueue[i].next_event_time = 0; + spdk_vhost_vq_used_signal(&bvdev->vdev, &bvdev->vdev.virtqueue[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", bvdev->vdev.name); + + if (bvdev->bdev_io_channel) { + spdk_put_io_channel(bvdev->bdev_io_channel); + bvdev->bdev_io_channel = NULL; + } + + free_task_pool(bvdev); + spdk_poller_unregister(&bvdev->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(bvdev->destroy_ctx.event_ctx, 0); + + return -1; +} + +static int +spdk_vhost_blk_stop(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to stop non-blk controller as a blk one.\n"); + goto err; + } + + bvdev->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&bvdev->requestq_poller); + bvdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, + bvdev, 1000); + return 0; + +err: + spdk_vhost_dev_backend_event_done(event_ctx, -1); + return -1; +} + +static void +spdk_vhost_blk_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_bdev *bdev = spdk_vhost_blk_get_dev(vdev); + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + return; + } + + assert(bvdev != NULL); + spdk_json_write_name(w, "block"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "readonly"); + spdk_json_write_bool(w, bvdev->readonly); + + spdk_json_write_name(w, "bdev"); + if (bdev) { + spdk_json_write_string(w, spdk_bdev_get_name(bdev)); + } else { + spdk_json_write_null(w); + } + + spdk_json_write_object_end(w); +} + +static void +spdk_vhost_blk_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_blk_dev *bvdev; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + return; + } + + if (!bvdev->bdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_vhost_blk_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "dev_name", spdk_bdev_get_name(bvdev->bdev)); + spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask)); + spdk_json_write_named_bool(w, "readonly", bvdev->readonly); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int spdk_vhost_blk_destroy(struct spdk_vhost_dev *dev); + +static int +spdk_vhost_blk_get_config(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t len) +{ + struct virtio_blk_config *blkcfg = (struct virtio_blk_config *)config; + struct spdk_vhost_blk_dev *bvdev; + struct spdk_bdev *bdev; + uint32_t blk_size; + uint64_t blkcnt; + + bvdev = to_blk_dev(vdev); + if (bvdev == NULL) { + SPDK_ERRLOG("Trying to get virito_blk configuration failed\n"); + return -1; + } + + if (len < sizeof(*blkcfg)) { + return -1; + } + + bdev = bvdev->bdev; + if (bdev == NULL) { + /* We can't just return -1 here as this GET_CONFIG message might + * be caused 
by a QEMU VM reboot. Returning -1 will indicate an + * error to QEMU, who might then decide to terminate itself. + * We don't want that. A simple reboot shouldn't break the system. + * + * Presenting a block device with block size 0 and block count 0 + * doesn't cause any problems on QEMU side and the virtio-pci + * device is even still available inside the VM, but there will + * be no block device created for it - the kernel drivers will + * silently reject it. + */ + blk_size = 0; + blkcnt = 0; + } else { + blk_size = spdk_bdev_get_block_size(bdev); + blkcnt = spdk_bdev_get_num_blocks(bdev); + } + + memset(blkcfg, 0, sizeof(*blkcfg)); + blkcfg->blk_size = blk_size; + /* minimum I/O size in blocks */ + blkcfg->min_io_size = 1; + /* expressed in 512 Bytes sectors */ + blkcfg->capacity = (blkcnt * blk_size) / 512; + blkcfg->size_max = 131072; + /* -2 for REQ and RESP and -1 for region boundary splitting */ + blkcfg->seg_max = SPDK_VHOST_IOVS_MAX - 2 - 1; + /* QEMU can overwrite this value when started */ + blkcfg->num_queues = SPDK_VHOST_MAX_VQUEUES; + + return 0; +} + +static const struct spdk_vhost_dev_backend vhost_blk_device_backend = { + .virtio_features = SPDK_VHOST_FEATURES | + (1ULL << VIRTIO_BLK_F_SIZE_MAX) | (1ULL << VIRTIO_BLK_F_SEG_MAX) | + (1ULL << VIRTIO_BLK_F_GEOMETRY) | (1ULL << VIRTIO_BLK_F_RO) | + (1ULL << VIRTIO_BLK_F_BLK_SIZE) | (1ULL << VIRTIO_BLK_F_TOPOLOGY) | + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI) | + (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | + (1ULL << VIRTIO_BLK_F_MQ), + .disabled_features = SPDK_VHOST_DISABLED_FEATURES | (1ULL << VIRTIO_BLK_F_GEOMETRY) | + (1ULL << VIRTIO_BLK_F_RO) | (1ULL << VIRTIO_BLK_F_FLUSH) | (1ULL << VIRTIO_BLK_F_CONFIG_WCE) | + (1ULL << VIRTIO_BLK_F_BARRIER) | (1ULL << VIRTIO_BLK_F_SCSI), + .start_device = spdk_vhost_blk_start, + .stop_device = spdk_vhost_blk_stop, + .vhost_get_config = spdk_vhost_blk_get_config, + .dump_info_json = spdk_vhost_blk_dump_info_json, + .write_config_json = spdk_vhost_blk_write_config_json, + .remove_device = spdk_vhost_blk_destroy, +}; + +int +spdk_vhost_blk_controller_construct(void) +{ + struct spdk_conf_section *sp; + unsigned ctrlr_num; + char *bdev_name; + char *cpumask; + char *name; + bool readonly; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostBlk")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostBlk%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostBlk%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + readonly = spdk_conf_section_get_boolval(sp, "ReadOnly", false); + + bdev_name = spdk_conf_section_get_val(sp, "Dev"); + if (bdev_name == NULL) { + continue; + } + + if (spdk_vhost_blk_construct(name, cpumask, bdev_name, readonly) < 0) { + return -1; + } + } + + return 0; +} + +int +spdk_vhost_blk_construct(const char *name, const char *cpumask, const char *dev_name, bool readonly) +{ + struct spdk_vhost_blk_dev *bvdev = NULL; + struct spdk_bdev *bdev; + int ret = 0; + + spdk_vhost_lock(); + bdev = spdk_bdev_get_by_name(dev_name); + if (bdev == NULL) { + SPDK_ERRLOG("Controller %s: bdev '%s' not found\n", + name, dev_name); + ret = -ENODEV; + goto out; + } + + bvdev = spdk_dma_zmalloc(sizeof(*bvdev), 
SPDK_CACHE_LINE_SIZE, NULL); + if (bvdev == NULL) { + ret = -ENOMEM; + goto out; + } + + ret = spdk_bdev_open(bdev, true, bdev_remove_cb, bvdev, &bvdev->bdev_desc); + if (ret != 0) { + SPDK_ERRLOG("Controller %s: could not open bdev '%s', error=%d\n", + name, dev_name, ret); + goto out; + } + + bvdev->bdev = bdev; + bvdev->readonly = readonly; + ret = spdk_vhost_dev_register(&bvdev->vdev, name, cpumask, &vhost_blk_device_backend); + if (ret != 0) { + spdk_bdev_close(bvdev->bdev_desc); + goto out; + } + + if (readonly && rte_vhost_driver_enable_features(bvdev->vdev.path, (1ULL << VIRTIO_BLK_F_RO))) { + SPDK_ERRLOG("Controller %s: failed to set as a readonly\n", name); + spdk_bdev_close(bvdev->bdev_desc); + + if (spdk_vhost_dev_unregister(&bvdev->vdev) != 0) { + SPDK_ERRLOG("Controller %s: failed to remove controller\n", name); + } + + ret = -1; + goto out; + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: using bdev '%s'\n", name, dev_name); +out: + if (ret != 0 && bvdev) { + spdk_dma_free(bvdev); + } + spdk_vhost_unlock(); + return ret; +} + +static int +spdk_vhost_blk_destroy(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_blk_dev *bvdev = to_blk_dev(vdev); + int rc; + + if (!bvdev) { + return -EINVAL; + } + + rc = spdk_vhost_dev_unregister(&bvdev->vdev); + if (rc != 0) { + return rc; + } + + if (bvdev->bdev_desc) { + spdk_bdev_close(bvdev->bdev_desc); + bvdev->bdev_desc = NULL; + } + bvdev->bdev = NULL; + + spdk_dma_free(bvdev); + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_blk", SPDK_LOG_VHOST_BLK) +SPDK_LOG_REGISTER_COMPONENT("vhost_blk_data", SPDK_LOG_VHOST_BLK_DATA) diff --git a/src/spdk/lib/vhost/vhost_internal.h b/src/spdk/lib/vhost/vhost_internal.h new file mode 100644 index 00000000..9c0ad211 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_internal.h @@ -0,0 +1,277 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SPDK_VHOST_INTERNAL_H +#define SPDK_VHOST_INTERNAL_H + +#include "spdk/stdinc.h" + +#include + +#include "spdk_internal/log.h" +#include "spdk/event.h" +#include "spdk/rpc.h" + +#define SPDK_CACHE_LINE_SIZE RTE_CACHE_LINE_SIZE + +#ifndef VHOST_USER_F_PROTOCOL_FEATURES +#define VHOST_USER_F_PROTOCOL_FEATURES 30 +#endif + +#ifndef VIRTIO_F_VERSION_1 +#define VIRTIO_F_VERSION_1 32 +#endif + +#ifndef VIRTIO_BLK_F_MQ +#define VIRTIO_BLK_F_MQ 12 /* support more than one vq */ +#endif + +#ifndef VIRTIO_BLK_F_CONFIG_WCE +#define VIRTIO_BLK_F_CONFIG_WCE 11 +#endif + +#define SPDK_VHOST_MAX_VQUEUES 256 +#define SPDK_VHOST_MAX_VQ_SIZE 1024 + +#define SPDK_VHOST_SCSI_CTRLR_MAX_DEVS 8 + +#define SPDK_VHOST_IOVS_MAX 129 + +/* + * Rate at which stats are checked for interrupt coalescing. + */ +#define SPDK_VHOST_DEV_STATS_CHECK_INTERVAL_MS 10 +/* + * Default threshold at which interrupts start to be coalesced. + */ +#define SPDK_VHOST_VQ_IOPS_COALESCING_THRESHOLD 60000 + +/* + * Currently coalescing is not used by default. + * Setting this to value > 0 here or by RPC will enable coalescing. + */ +#define SPDK_VHOST_COALESCING_DELAY_BASE_US 0 + + +#define SPDK_VHOST_FEATURES ((1ULL << VHOST_F_LOG_ALL) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \ + (1ULL << VIRTIO_F_VERSION_1) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_RING_F_INDIRECT_DESC)) + +#define SPDK_VHOST_DISABLED_FEATURES ((1ULL << VIRTIO_RING_F_EVENT_IDX) | \ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY)) + +struct spdk_vhost_virtqueue { + struct rte_vhost_vring vring; + void *tasks; + + /* Request count from last stats check */ + uint32_t req_cnt; + + /* Request count from last event */ + uint16_t used_req_cnt; + + /* How long interrupt is delayed */ + uint32_t irq_delay_time; + + /* Next time when we need to send event */ + uint64_t next_event_time; + +} __attribute((aligned(SPDK_CACHE_LINE_SIZE))); + +struct spdk_vhost_dev_backend { + uint64_t virtio_features; + uint64_t disabled_features; + + /** + * Callbacks for starting and pausing the device. + * The first param is struct spdk_vhost_dev *. + * The second one is event context that has to be + * passed to spdk_vhost_dev_backend_event_done(). + */ + spdk_vhost_event_fn start_device; + spdk_vhost_event_fn stop_device; + + int (*vhost_get_config)(struct spdk_vhost_dev *vdev, uint8_t *config, uint32_t len); + int (*vhost_set_config)(struct spdk_vhost_dev *vdev, uint8_t *config, + uint32_t offset, uint32_t size, uint32_t flags); + + void (*dump_info_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + void (*write_config_json)(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); + int (*remove_device)(struct spdk_vhost_dev *vdev); +}; + +struct spdk_vhost_dev { + struct rte_vhost_memory *mem; + char *name; + char *path; + + /* Unique device ID. */ + unsigned id; + + /* rte_vhost device ID. */ + int vid; + int task_cnt; + int32_t lcore; + struct spdk_cpuset *cpumask; + bool registered; + + const struct spdk_vhost_dev_backend *backend; + + /* Saved orginal values used to setup coalescing to avoid integer + * rounding issues during save/load config. + */ + uint32_t coalescing_delay_us; + uint32_t coalescing_iops_threshold; + + uint32_t coalescing_delay_time_base; + + /* Threshold when event coalescing for virtqueue will be turned on. */ + uint32_t coalescing_io_rate_threshold; + + /* Next time when stats for event coalescing will be checked. 
*/ + uint64_t next_stats_check_time; + + /* Interval used for event coalescing checking. */ + uint64_t stats_check_interval; + + uint16_t max_queues; + + uint64_t negotiated_features; + + struct spdk_vhost_virtqueue virtqueue[SPDK_VHOST_MAX_VQUEUES]; + + TAILQ_ENTRY(spdk_vhost_dev) tailq; +}; + +struct spdk_vhost_dev_destroy_ctx { + struct spdk_poller *poller; + void *event_ctx; +}; + +struct spdk_vhost_dev *spdk_vhost_dev_find(const char *ctrlr_name); + +void *spdk_vhost_gpa_to_vva(struct spdk_vhost_dev *vdev, uint64_t addr, uint64_t len); + +uint16_t spdk_vhost_vq_avail_ring_get(struct spdk_vhost_virtqueue *vq, uint16_t *reqs, + uint16_t reqs_len); + +/** + * Get a virtio descriptor at given index in given virtqueue. + * The descriptor will provide access to the entire descriptor + * chain. The subsequent descriptors are accesible via + * \c spdk_vhost_vring_desc_get_next. + * \param vdev vhost device + * \param vq virtqueue + * \param req_idx descriptor index + * \param desc pointer to be set to the descriptor + * \param desc_table descriptor table to be used with + * \c spdk_vhost_vring_desc_get_next. This might be either + * default virtqueue descriptor table or per-chain indirect + * table. + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid. + * If -1 is returned, the content of params is undefined. + */ +int spdk_vhost_vq_get_desc(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, + uint16_t req_idx, struct vring_desc **desc, struct vring_desc **desc_table, + uint32_t *desc_table_size); + +/** + * Send IRQ/call client (if pending) for \c vq. + * \param vdev vhost device + * \param vq virtqueue + * \return + * 0 - if no interrupt was signalled + * 1 - if interrupt was signalled + */ +int spdk_vhost_vq_used_signal(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq); + + +/** + * Send IRQs for all queues that need to be signaled. + * \param vdev vhost device + * \param vq virtqueue + */ +void spdk_vhost_dev_used_signal(struct spdk_vhost_dev *vdev); + +void spdk_vhost_vq_used_ring_enqueue(struct spdk_vhost_dev *vdev, struct spdk_vhost_virtqueue *vq, + uint16_t id, uint32_t len); + +/** + * Get subsequent descriptor from given table. + * \param desc current descriptor, will be set to the + * next descriptor (NULL in case this is the last + * descriptor in the chain or the next desc is invalid) + * \param desc_table descriptor table + * \param desc_table_size size of the *desc_table* + * \return 0 on success, -1 if given index is invalid + * The *desc* param will be set regardless of the + * return value. 
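+ *
+ * A minimal sketch of walking an entire chain with this API (error handling
+ * trimmed; blk_iovs_setup() in vhost_blk.c is a complete in-tree user):
+ *
+ *   struct vring_desc *desc, *desc_table;
+ *   uint32_t desc_table_size;
+ *
+ *   if (spdk_vhost_vq_get_desc(vdev, vq, req_idx, &desc, &desc_table,
+ *                              &desc_table_size) != 0) {
+ *           return -1;
+ *   }
+ *   while (desc != NULL) {
+ *           // consume desc->addr and desc->len here
+ *           if (spdk_vhost_vring_desc_get_next(&desc, desc_table,
+ *                                              desc_table_size) != 0) {
+ *                   return -1;   // malformed chain
+ *           }
+ *   }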
+ */ +int spdk_vhost_vring_desc_get_next(struct vring_desc **desc, + struct vring_desc *desc_table, uint32_t desc_table_size); +bool spdk_vhost_vring_desc_is_wr(struct vring_desc *cur_desc); + +int spdk_vhost_vring_desc_to_iov(struct spdk_vhost_dev *vdev, struct iovec *iov, + uint16_t *iov_index, const struct vring_desc *desc); + +static inline bool __attribute__((always_inline)) +spdk_vhost_dev_has_feature(struct spdk_vhost_dev *vdev, unsigned feature_id) +{ + return vdev->negotiated_features & (1ULL << feature_id); +} + +int spdk_vhost_dev_register(struct spdk_vhost_dev *vdev, const char *name, const char *mask_str, + const struct spdk_vhost_dev_backend *backend); +int spdk_vhost_dev_unregister(struct spdk_vhost_dev *vdev); + +int spdk_vhost_scsi_controller_construct(void); +int spdk_vhost_blk_controller_construct(void); +void spdk_vhost_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w); +void spdk_vhost_dev_backend_event_done(void *event_ctx, int response); +void spdk_vhost_lock(void); +void spdk_vhost_unlock(void); +int spdk_remove_vhost_controller(struct spdk_vhost_dev *vdev); +int spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf); +int spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd); +int spdk_vhost_nvme_get_cap(int vid, uint64_t *cap); +int spdk_vhost_nvme_controller_construct(void); +int spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t io_queues); +int spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev); +int spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, + const char *bdev_name); + +#endif /* SPDK_VHOST_INTERNAL_H */ diff --git a/src/spdk/lib/vhost/vhost_nvme.c b/src/spdk/lib/vhost/vhost_nvme.c new file mode 100644 index 00000000..35015d93 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_nvme.c @@ -0,0 +1,1465 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/env.h" +#include "spdk/conf.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/barrier.h" +#include "spdk/vhost.h" +#include "spdk/bdev.h" +#include "spdk/version.h" +#include "spdk/nvme_spec.h" +#include "spdk/likely.h" + +#include "vhost_internal.h" + +#define MAX_IO_QUEUES 31 +#define MAX_IOVS 64 +#define MAX_NAMESPACE 8 +#define MAX_QUEUE_ENTRIES_SUPPORTED 256 +#define MAX_BATCH_IO 8 + +struct spdk_vhost_nvme_sq { + uint16_t sqid; + uint16_t size; + uint16_t cqid; + bool valid; + struct spdk_nvme_cmd *sq_cmd; + uint16_t sq_head; + uint16_t sq_tail; +}; + +struct spdk_vhost_nvme_cq { + uint8_t phase; + uint16_t size; + uint16_t cqid; + bool valid; + volatile struct spdk_nvme_cpl *cq_cqe; + uint16_t cq_head; + uint16_t guest_signaled_cq_head; + uint32_t need_signaled_cnt; + STAILQ_HEAD(, spdk_vhost_nvme_task) cq_full_waited_tasks; + bool irq_enabled; + int virq; +}; + +struct spdk_vhost_nvme_ns { + struct spdk_bdev *bdev; + uint32_t block_size; + uint64_t capacity; + uint32_t nsid; + uint32_t active_ns; + struct spdk_bdev_desc *bdev_desc; + struct spdk_io_channel *bdev_io_channel; + struct spdk_nvme_ns_data nsdata; +}; + +struct spdk_vhost_nvme_task { + struct spdk_nvme_cmd cmd; + struct spdk_vhost_nvme_dev *nvme; + uint16_t sqid; + uint16_t cqid; + + /** array of iovecs to transfer. */ + struct iovec iovs[MAX_IOVS]; + + /** Number of iovecs in iovs array. */ + int iovcnt; + + /** Current iovec position. */ + int iovpos; + + /** Offset in current iovec. */ + uint32_t iov_offset; + + /* for bdev_io_wait */ + struct spdk_bdev_io_wait_entry bdev_io_wait; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_ns *ns; + + /* parent pointer. */ + struct spdk_vhost_nvme_task *parent; + uint8_t dnr; + uint8_t sct; + uint8_t sc; + uint32_t num_children; + STAILQ_ENTRY(spdk_vhost_nvme_task) stailq; +}; + +struct spdk_vhost_nvme_dev { + struct spdk_vhost_dev vdev; + + uint32_t num_io_queues; + union spdk_nvme_cap_register cap; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + struct spdk_nvme_ctrlr_data cdata; + + uint32_t num_sqs; + uint32_t num_cqs; + + uint32_t num_ns; + struct spdk_vhost_nvme_ns ns[MAX_NAMESPACE]; + + volatile uint32_t *dbbuf_dbs; + volatile uint32_t *dbbuf_eis; + struct spdk_vhost_nvme_sq sq_queue[MAX_IO_QUEUES + 1]; + struct spdk_vhost_nvme_cq cq_queue[MAX_IO_QUEUES + 1]; + + TAILQ_ENTRY(spdk_vhost_nvme_dev) tailq; + STAILQ_HEAD(, spdk_vhost_nvme_task) free_tasks; + struct spdk_poller *requestq_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; +}; + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend; + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. 
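+ * FW_VERSION below is therefore just the major, minor and patch version
+ * strings concatenated (the Identify Controller FR field is 8 space-padded
+ * ASCII characters).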
+ */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static int +spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task); + +static struct spdk_vhost_nvme_dev * +to_nvme_dev(struct spdk_vhost_dev *vdev) +{ + if (vdev->backend != &spdk_vhost_nvme_device_backend) { + SPDK_ERRLOG("%s: not a vhost-nvme device\n", vdev->name); + return NULL; + } + + return SPDK_CONTAINEROF(vdev, struct spdk_vhost_nvme_dev, vdev); +} + +static TAILQ_HEAD(, spdk_vhost_nvme_dev) g_nvme_ctrlrs = TAILQ_HEAD_INITIALIZER(g_nvme_ctrlrs); + +static inline unsigned int sq_offset(unsigned int qid, uint32_t db_stride) +{ + return qid * 2 * db_stride; +} + +static inline unsigned int cq_offset(unsigned int qid, uint32_t db_stride) +{ + return (qid * 2 + 1) * db_stride; +} + +static void +nvme_inc_cq_head(struct spdk_vhost_nvme_cq *cq) +{ + cq->cq_head++; + if (cq->cq_head >= cq->size) { + cq->cq_head = 0; + cq->phase = !cq->phase; + } +} + +static bool +nvme_cq_is_full(struct spdk_vhost_nvme_cq *cq) +{ + return ((cq->cq_head + 1) % cq->size == cq->guest_signaled_cq_head); +} + +static void +nvme_inc_sq_head(struct spdk_vhost_nvme_sq *sq) +{ + sq->sq_head = (sq->sq_head + 1) % sq->size; +} + +static struct spdk_vhost_nvme_sq * +spdk_vhost_nvme_get_sq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->sq_queue[qid]; +} + +static struct spdk_vhost_nvme_cq * +spdk_vhost_nvme_get_cq_from_qid(struct spdk_vhost_nvme_dev *dev, uint16_t qid) +{ + if (spdk_unlikely(!qid || qid > MAX_IO_QUEUES)) { + return NULL; + } + + return &dev->cq_queue[qid]; +} + +static int +spdk_nvme_map_prps(struct spdk_vhost_nvme_dev *nvme, struct spdk_nvme_cmd *cmd, + struct spdk_vhost_nvme_task *task, uint32_t len) +{ + uint64_t prp1, prp2; + void *vva; + uint32_t i; + uint32_t residue_len, nents, mps = 4096; + uint64_t *prp_list; + + prp1 = cmd->dptr.prp.prp1; + prp2 = cmd->dptr.prp.prp2; + + /* PRP1 may started with unaligned page address */ + residue_len = mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp1, residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + task->iovs[0].iov_base = vva; + task->iovs[0].iov_len = residue_len; + len -= residue_len; + + if (len) { + if (spdk_unlikely(prp2 == 0)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PRP2=0 in command\n"); + return -1; + } + + if (len <= mps) { + /* 2 PRP used */ + task->iovcnt = 2; + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, len); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + task->iovs[1].iov_base = vva; + task->iovs[1].iov_len = len; + } else { + /* PRP list used */ + nents = (len + mps - 1) / mps; + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp2, nents * sizeof(*prp_list)); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + prp_list = vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = spdk_vhost_gpa_to_vva(&nvme->vdev, prp_list[i], residue_len); + if (spdk_unlikely(vva == NULL)) { + return -1; + } + task->iovs[i + 1].iov_base = vva; + task->iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + task->iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + task->iovcnt = 1; + } + + return 0; +} + +static void +spdk_nvme_cq_signal_fd(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_cq *cq; + 
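+	/* The guest shares queue positions through the shadow doorbell pages set
+	 * up by the DOORBELL_BUFFER_CONFIG admin command: dbbuf_dbs is a 4 KiB
+	 * page of 32-bit slots where, with doorbell stride 1, slot 2*qid holds
+	 * the SQ tail and slot 2*qid+1 holds the CQ head (see sq_offset() and
+	 * cq_offset() above); dbbuf_eis is the matching event-index page that
+	 * the device writes back.
+	 */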
uint32_t qid, cq_head; + + assert(nvme != NULL); + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq || !cq->valid) { + continue; + } + + cq_head = nvme->dbbuf_dbs[cq_offset(qid, 1)]; + if (cq->irq_enabled && cq->need_signaled_cnt && (cq->cq_head != cq_head)) { + eventfd_write(cq->virq, (eventfd_t)1); + cq->need_signaled_cnt = 0; + } + } +} + +static void +spdk_vhost_nvme_task_complete(struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_dev *nvme = task->nvme; + struct spdk_nvme_cpl cqe = {0}; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq *sq; + struct spdk_nvme_cmd *cmd = &task->cmd; + uint16_t cqid = task->cqid; + uint16_t sqid = task->sqid; + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid); + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, sqid); + if (spdk_unlikely(!cq || !sq)) { + return; + } + + cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(cqid, 1)]; + if (spdk_unlikely(nvme_cq_is_full(cq))) { + STAILQ_INSERT_TAIL(&cq->cq_full_waited_tasks, task, stailq); + return; + } + + cqe.sqid = sqid; + cqe.sqhd = sq->sq_head; + cqe.cid = cmd->cid; + cqe.status.dnr = task->dnr; + cqe.status.sct = task->sct; + cqe.status.sc = task->sc; + cqe.status.p = !cq->phase; + cq->cq_cqe[cq->cq_head] = cqe; + spdk_smp_wmb(); + cq->cq_cqe[cq->cq_head].status.p = cq->phase; + + nvme_inc_cq_head(cq); + cq->need_signaled_cnt++; + + /* MMIO Controll */ + nvme->dbbuf_eis[cq_offset(cqid, 1)] = (uint32_t)(cq->guest_signaled_cq_head - 1); + + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); +} + +static void +blk_request_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *task = cb_arg; + struct spdk_nvme_cmd *cmd = &task->cmd; + int sc, sct; + + assert(bdev_io != NULL); + + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + spdk_bdev_free_io(bdev_io); + + task->dnr = !success; + task->sct = sct; + task->sc = sc; + + if (spdk_unlikely(!success)) { + SPDK_ERRLOG("I/O error, sector %u\n", cmd->cdw10); + } + + spdk_vhost_nvme_task_complete(task); +} + +static void +blk_unmap_complete_cb(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) +{ + struct spdk_vhost_nvme_task *child = cb_arg; + struct spdk_vhost_nvme_task *task = child->parent; + struct spdk_vhost_nvme_dev *nvme = task->nvme; + int sct, sc; + + assert(bdev_io != NULL); + + task->num_children--; + if (!success) { + task->dnr = 1; + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + task->sct = sct; + task->sc = sc; + } + + spdk_bdev_free_io(bdev_io); + + if (!task->num_children) { + spdk_vhost_nvme_task_complete(task); + } + + STAILQ_INSERT_TAIL(&nvme->free_tasks, child, stailq); +} + +static struct spdk_vhost_nvme_ns * +spdk_vhost_nvme_get_ns_from_nsid(struct spdk_vhost_nvme_dev *dev, uint32_t nsid) +{ + if (spdk_unlikely(!nsid || nsid > dev->num_ns)) { + return NULL; + } + + return &dev->ns[nsid - 1]; +} + +static void +vhost_nvme_resubmit_task(void *arg) +{ + struct spdk_vhost_nvme_task *task = (struct spdk_vhost_nvme_task *)arg; + int rc; + + rc = spdk_nvme_process_sq(task->nvme, task->sq, task); + if (rc) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "vhost_nvme: task resubmit failed, rc = %d.\n", rc); + } +} + +static int +vhost_nvme_queue_task(struct spdk_vhost_nvme_task *task) +{ + int rc; + + task->bdev_io_wait.bdev = task->ns->bdev; + task->bdev_io_wait.cb_fn = vhost_nvme_resubmit_task; + task->bdev_io_wait.cb_arg = task; + + rc = spdk_bdev_queue_io_wait(task->ns->bdev, task->ns->bdev_io_channel, 
&task->bdev_io_wait); + if (rc != 0) { + SPDK_ERRLOG("Queue io failed in vhost_nvme_queue_task, rc=%d.\n", rc); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_vhost_nvme_task_complete(task); + } + + return rc; +} + +static int +spdk_nvme_process_sq(struct spdk_vhost_nvme_dev *nvme, struct spdk_vhost_nvme_sq *sq, + struct spdk_vhost_nvme_task *task) +{ + struct spdk_vhost_nvme_task *child; + struct spdk_nvme_cmd *cmd = &task->cmd; + struct spdk_vhost_nvme_ns *ns; + int ret = -1; + uint32_t len, nlba, block_size; + uint64_t slba; + struct spdk_nvme_dsm_range *range; + uint16_t i, num_ranges = 0; + + task->nvme = nvme; + task->dnr = 0; + task->sct = 0; + task->sc = 0; + + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, cmd->nsid); + if (spdk_unlikely(!ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + block_size = ns->block_size; + task->num_children = 0; + task->cqid = sq->cqid; + task->sqid = sq->sqid; + + task->ns = ns; + + if (spdk_unlikely(!ns->active_ns)) { + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + /* valid only for Read/Write commands */ + nlba = (cmd->cdw12 & 0xffff) + 1; + slba = cmd->cdw11; + slba = (slba << 32) | cmd->cdw10; + + if (cmd->opc == SPDK_NVME_OPC_READ || cmd->opc == SPDK_NVME_OPC_WRITE || + cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + if (cmd->psdt != SPDK_NVME_PSDT_PRP) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Invalid PSDT %u%ub in command\n", + cmd->psdt >> 1, cmd->psdt & 1u); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_vhost_nvme_task_complete(task); + return -1; + } + + if (cmd->opc == SPDK_NVME_OPC_DATASET_MANAGEMENT) { + num_ranges = (cmd->cdw10 & 0xff) + 1; + len = num_ranges * sizeof(struct spdk_nvme_dsm_range); + } else { + len = nlba * block_size; + } + + ret = spdk_nvme_map_prps(nvme, cmd, task, len); + if (spdk_unlikely(ret != 0)) { + SPDK_ERRLOG("nvme command map prps failed\n"); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_vhost_nvme_task_complete(task); + return -1; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + ret = spdk_bdev_readv(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_WRITE: + ret = spdk_bdev_writev(ns->bdev_desc, ns->bdev_io_channel, + task->iovs, task->iovcnt, slba * block_size, + nlba * block_size, blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_FLUSH: + ret = spdk_bdev_flush(ns->bdev_desc, ns->bdev_io_channel, + 0, ns->capacity, + blk_request_complete_cb, task); + break; + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + range = (struct spdk_nvme_dsm_range *)task->iovs[0].iov_base; + for (i = 0; i < num_ranges; i++) { + if (!STAILQ_EMPTY(&nvme->free_tasks)) { + child = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + SPDK_ERRLOG("No free task now\n"); + ret = -1; + break; + } + task->num_children++; + child->parent = task; + ret = spdk_bdev_unmap(ns->bdev_desc, ns->bdev_io_channel, + range[i].starting_lba * block_size, + range[i].length * block_size, + blk_unmap_complete_cb, child); + if (ret) { + STAILQ_INSERT_TAIL(&nvme->free_tasks, 
child, stailq); + break; + } + } + break; + default: + ret = -1; + break; + } + + if (spdk_unlikely(ret)) { + if (ret == -ENOMEM) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "No memory, start to queue io.\n"); + task->sq = sq; + ret = vhost_nvme_queue_task(task); + } else { + /* post error status to cqe */ + SPDK_ERRLOG("Error Submission For Command %u, ret %d\n", cmd->opc, ret); + task->dnr = 1; + task->sct = SPDK_NVME_SCT_GENERIC; + task->sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_vhost_nvme_task_complete(task); + } + } + + return ret; +} + +static int +nvme_worker(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = (struct spdk_vhost_nvme_dev *)arg; + struct spdk_vhost_nvme_sq *sq; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_task *task; + uint32_t qid, dbbuf_sq; + int ret; + int count = -1; + + if (spdk_unlikely(!nvme->num_sqs)) { + return -1; + } + + /* worker thread can't start before the admin doorbell + * buffer config command + */ + if (spdk_unlikely(!nvme->dbbuf_dbs)) { + return -1; + } + + for (qid = 1; qid <= MAX_IO_QUEUES; qid++) { + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq->valid) { + continue; + } + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, sq->cqid); + if (spdk_unlikely(!cq)) { + return -1; + } + cq->guest_signaled_cq_head = nvme->dbbuf_dbs[cq_offset(sq->cqid, 1)]; + if (spdk_unlikely(!STAILQ_EMPTY(&cq->cq_full_waited_tasks) && + !nvme_cq_is_full(cq))) { + task = STAILQ_FIRST(&cq->cq_full_waited_tasks); + STAILQ_REMOVE_HEAD(&cq->cq_full_waited_tasks, stailq); + spdk_vhost_nvme_task_complete(task); + } + + dbbuf_sq = nvme->dbbuf_dbs[sq_offset(qid, 1)]; + sq->sq_tail = (uint16_t)dbbuf_sq; + count = 0; + + while (sq->sq_head != sq->sq_tail) { + if (spdk_unlikely(!sq->sq_cmd)) { + break; + } + if (spdk_likely(!STAILQ_EMPTY(&nvme->free_tasks))) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + } else { + return -1; + } + + task->cmd = sq->sq_cmd[sq->sq_head]; + nvme_inc_sq_head(sq); + + /* processing IO */ + ret = spdk_nvme_process_sq(nvme, sq, task); + if (spdk_unlikely(ret)) { + SPDK_ERRLOG("QID %u CID %u, SQ HEAD %u, DBBUF SQ TAIL %u\n", qid, task->cmd.cid, sq->sq_head, + sq->sq_tail); + } + + /* MMIO Control */ + nvme->dbbuf_eis[sq_offset(qid, 1)] = (uint32_t)(sq->sq_head - 1); + + /* Maximum batch I/Os to pick up at once */ + if (count++ == MAX_BATCH_IO) { + break; + } + } + } + + /* Completion Queue */ + spdk_nvme_cq_signal_fd(nvme); + + return count; +} + +static int +vhost_nvme_doorbell_buffer_config(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint64_t dbs_dma_addr, eis_dma_addr; + + dbs_dma_addr = cmd->dptr.prp.prp1; + eis_dma_addr = cmd->dptr.prp.prp2; + + if ((dbs_dma_addr % 4096) || (eis_dma_addr % 4096)) { + return -1; + } + /* Guest Physical Address to Host Virtual Address */ + nvme->dbbuf_dbs = spdk_vhost_gpa_to_vva(&nvme->vdev, dbs_dma_addr, 4096); + nvme->dbbuf_eis = spdk_vhost_gpa_to_vva(&nvme->vdev, eis_dma_addr, 4096); + if (!nvme->dbbuf_dbs || !nvme->dbbuf_eis) { + return -1; + } + /* zeroed the doorbell buffer memory */ + memset((void *)nvme->dbbuf_dbs, 0, 4096); + memset((void *)nvme->dbbuf_eis, 0, 4096); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_create_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid, qsize, cqid; + uint64_t dma_addr; + uint64_t requested_len; + struct spdk_vhost_nvme_cq *cq; + struct spdk_vhost_nvme_sq 
*sq; + + /* physically contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + cqid = (cmd->cdw11 >> 16) & 0xffff; + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, cqid); + if (!sq || !cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u or CQID %u\n", + qid, cqid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + + sq->sqid = qid; + sq->cqid = cqid; + sq->size = qsize + 1; + sq->sq_head = sq->sq_tail = 0; + requested_len = sizeof(struct spdk_nvme_cmd) * sq->size; + sq->sq_cmd = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len); + if (!sq->sq_cmd) { + return -1; + } + nvme->num_sqs++; + sq->valid = true; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_sq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_sq *sq; + + qid = cmd->cdw10 & 0xffff; + sq = spdk_vhost_nvme_get_sq_from_qid(nvme, qid); + if (!sq) { + return -1; + } + + /* We have not yet seen a case where a submission queue + * is deleted while I/O is still running against it; if + * that ever becomes possible, we must ensure the poller + * no longer touches this submission queue. + */ + nvme->num_sqs--; + sq->valid = false; + + memset(sq, 0, sizeof(*sq)); + sq->sq_cmd = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + + return 0; +} + +static int +vhost_nvme_create_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qsize, qid; + uint64_t dma_addr; + struct spdk_vhost_nvme_cq *cq; + uint64_t requested_len; + + /* physically contiguous */ + if (!(cmd->cdw11 & 0x1)) { + return -1; + } + + qid = cmd->cdw10 & 0xffff; + qsize = (cmd->cdw10 >> 16) & 0xffff; + dma_addr = cmd->dptr.prp.prp1; + if (!dma_addr || dma_addr % 4096) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "User requested invalid QID %u\n", qid); + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + cpl->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return -1; + } + cq->cqid = qid; + cq->size = qsize + 1; + cq->phase = 1; + cq->irq_enabled = (cmd->cdw11 >> 1) & 0x1; + /* Setup virq through vhost messages */ + cq->virq = -1; + cq->cq_head = 0; + cq->guest_signaled_cq_head = 0; + cq->need_signaled_cnt = 0; + requested_len = sizeof(struct spdk_nvme_cpl) * cq->size; + cq->cq_cqe = spdk_vhost_gpa_to_vva(&nvme->vdev, dma_addr, requested_len); + if (!cq->cq_cqe) { + return -1; + } + nvme->num_cqs++; + cq->valid = true; + STAILQ_INIT(&cq->cq_full_waited_tasks); + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static int +vhost_nvme_delete_io_cq(struct spdk_vhost_nvme_dev *nvme, + struct spdk_nvme_cmd *cmd, struct spdk_nvme_cpl *cpl) +{ + uint16_t qid; + struct spdk_vhost_nvme_cq *cq; + + qid = cmd->cdw10 & 0xffff; + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + nvme->num_cqs--; + cq->valid = false; + + memset(cq, 0, sizeof(*cq)); + cq->cq_cqe = NULL; + + cpl->status.sc = 0; + cpl->status.sct = 0; + return 0; +} + +static struct spdk_vhost_nvme_dev * +spdk_vhost_nvme_get_by_name(int vid) +{ + struct spdk_vhost_nvme_dev *nvme; + + TAILQ_FOREACH(nvme,
&g_nvme_ctrlrs, tailq) { + if (nvme->vdev.vid == vid) { + return nvme; + } + } + + return NULL; +} + +int +spdk_vhost_nvme_get_cap(int vid, uint64_t *cap) +{ + struct spdk_vhost_nvme_dev *nvme; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + *cap = nvme->cap.raw; + return 0; +} + +int +spdk_vhost_nvme_admin_passthrough(int vid, void *cmd, void *cqe, void *buf) +{ + struct spdk_nvme_cmd *req = (struct spdk_nvme_cmd *)cmd; + struct spdk_nvme_cpl *cpl = (struct spdk_nvme_cpl *)cqe; + struct spdk_vhost_nvme_ns *ns; + int ret = 0; + struct spdk_vhost_nvme_dev *nvme; + uint32_t cq_head, sq_tail; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Admin Command Opcode %u\n", req->opc); + switch (req->opc) { + case SPDK_NVME_OPC_IDENTIFY: + if (req->cdw10 == SPDK_NVME_IDENTIFY_CTRLR) { + memcpy(buf, &nvme->cdata, sizeof(struct spdk_nvme_ctrlr_data)); + + } else if (req->cdw10 == SPDK_NVME_IDENTIFY_NS) { + ns = spdk_vhost_nvme_get_ns_from_nsid(nvme, req->nsid); + if (!ns) { + cpl->status.sc = SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE; + cpl->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + break; + } + memcpy(buf, &ns->nsdata, sizeof(struct spdk_nvme_ns_data)); + } + /* success */ + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + case SPDK_NVME_OPC_CREATE_IO_CQ: + ret = vhost_nvme_create_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_CQ: + ret = vhost_nvme_delete_io_cq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_CREATE_IO_SQ: + ret = vhost_nvme_create_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_DELETE_IO_SQ: + ret = vhost_nvme_delete_io_sq(nvme, req, cpl); + break; + case SPDK_NVME_OPC_GET_FEATURES: + case SPDK_NVME_OPC_SET_FEATURES: + if (req->cdw10 == SPDK_NVME_FEAT_NUMBER_OF_QUEUES) { + cpl->status.sc = 0; + cpl->status.sct = 0; + cpl->cdw0 = (nvme->num_io_queues - 1) | ((nvme->num_io_queues - 1) << 16); + } else { + cpl->status.sc = SPDK_NVME_SC_INVALID_FIELD; + cpl->status.sct = SPDK_NVME_SCT_GENERIC; + } + break; + case SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG: + ret = vhost_nvme_doorbell_buffer_config(nvme, req, cpl); + break; + case SPDK_NVME_OPC_ABORT: + sq_tail = nvme->dbbuf_dbs[sq_offset(1, 1)] & 0xffffu; + cq_head = nvme->dbbuf_dbs[cq_offset(1, 1)] & 0xffffu; + SPDK_NOTICELOG("ABORT: CID %u, SQ_TAIL %u, CQ_HEAD %u\n", + (req->cdw10 >> 16) & 0xffffu, sq_tail, cq_head); + /* TODO: ABORT always fails for now */ + cpl->cdw0 = 1; + cpl->status.sc = 0; + cpl->status.sct = 0; + break; + } + + if (ret) { + SPDK_ERRLOG("Admin Passthrough Failed with opcode %u\n", req->opc); + } + + return 0; +} + +int +spdk_vhost_nvme_set_cq_call(int vid, uint16_t qid, int fd) +{ + struct spdk_vhost_nvme_dev *nvme; + struct spdk_vhost_nvme_cq *cq; + + nvme = spdk_vhost_nvme_get_by_name(vid); + if (!nvme) { + return -1; + } + + cq = spdk_vhost_nvme_get_cq_from_qid(nvme, qid); + if (!cq) { + return -1; + } + if (cq->irq_enabled) { + cq->virq = fd; + } else { + SPDK_ERRLOG("IRQ disabled for NVMe QID %d\n", qid); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + struct spdk_vhost_nvme_task *task; + + while (!STAILQ_EMPTY(&nvme->free_tasks)) { + task = STAILQ_FIRST(&nvme->free_tasks); + STAILQ_REMOVE_HEAD(&nvme->free_tasks, stailq); + spdk_dma_free(task); + } +} + +static int +alloc_task_pool(struct spdk_vhost_nvme_dev *nvme) +{ + uint32_t entries, i; + struct spdk_vhost_nvme_task *task; + + entries = nvme->num_io_queues * MAX_QUEUE_ENTRIES_SUPPORTED; + + for (i = 0; i
< entries; i++) { + task = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_task), + SPDK_CACHE_LINE_SIZE, NULL); + if (task == NULL) { + SPDK_ERRLOG("Controller %s alloc task pool failed\n", + nvme->vdev.name); + free_task_pool(nvme); + return -1; + } + STAILQ_INSERT_TAIL(&nvme->free_tasks, task, stailq); + } + + return 0; +} + +/* new device means enable the + * virtual NVMe controller + */ +static int +spdk_vhost_nvme_start_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return -1; + } + + if (alloc_task_pool(nvme)) { + return -1; + } + + SPDK_NOTICELOG("Start Device %u, Path %s, lcore %d\n", vdev->vid, + vdev->path, vdev->lcore); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + ns_dev->bdev_io_channel = spdk_bdev_get_io_channel(ns_dev->bdev_desc); + if (!ns_dev->bdev_io_channel) { + return -1; + } + } + + /* Start the NVMe Poller */ + nvme->requestq_poller = spdk_poller_register(nvme_worker, nvme, 0); + + spdk_vhost_dev_backend_event_done(event_ctx, 0); + return 0; +} + +static void +spdk_vhost_nvme_deactive_ns(struct spdk_vhost_nvme_ns *ns) +{ + ns->active_ns = 0; + spdk_bdev_close(ns->bdev_desc); + ns->bdev_desc = NULL; + ns->bdev = NULL; +} + +static void +bdev_remove_cb(void *remove_ctx) +{ + struct spdk_vhost_nvme_ns *ns = remove_ctx; + + SPDK_NOTICELOG("Removing NS %u, Block Device %s\n", + ns->nsid, spdk_bdev_get_name(ns->bdev)); + + spdk_vhost_nvme_deactive_ns(ns); +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_nvme_dev *nvme = arg; + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_NVME, "Destroy device poller callback\n"); + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (ns_dev->bdev_io_channel) { + spdk_put_io_channel(ns_dev->bdev_io_channel); + ns_dev->bdev_io_channel = NULL; + } + } + nvme->num_sqs = 0; + nvme->num_cqs = 0; + nvme->dbbuf_dbs = NULL; + nvme->dbbuf_eis = NULL; + } + } + + spdk_poller_unregister(&nvme->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(nvme->destroy_ctx.event_ctx, 0); + + return -1; +} + +/* Disable NVMe controller + */ +static int +spdk_vhost_nvme_stop_device(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + + if (nvme == NULL) { + return -1; + } + + free_task_pool(nvme); + SPDK_NOTICELOG("Stopping Device %u, Path %s\n", vdev->vid, vdev->path); + + nvme->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&nvme->requestq_poller); + nvme->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, nvme, 1000); + + return 0; +} + +static void +spdk_vhost_nvme_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_named_array_begin(w, "namespaces"); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_uint32(w, "nsid", ns_dev->nsid); + spdk_json_write_named_string(w, "bdev", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void 
+spdk_vhost_nvme_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns_dev; + uint32_t i; + + if (nvme == NULL) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_vhost_nvme_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_uint32(w, "io_queues", nvme->num_io_queues); + spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(nvme->vdev.cpumask)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < nvme->num_ns; i++) { + ns_dev = &nvme->ns[i]; + if (!ns_dev->active_ns) { + continue; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "add_vhost_nvme_ns"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", nvme->vdev.name); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(ns_dev->bdev)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +static const struct spdk_vhost_dev_backend spdk_vhost_nvme_device_backend = { + .start_device = spdk_vhost_nvme_start_device, + .stop_device = spdk_vhost_nvme_stop_device, + .dump_info_json = spdk_vhost_nvme_dump_info_json, + .write_config_json = spdk_vhost_nvme_write_config_json, + .remove_device = spdk_vhost_nvme_dev_remove, +}; + +static int +spdk_vhost_nvme_ns_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + struct spdk_nvme_ns_data *nsdata; + uint64_t num_blocks; + uint32_t i; + + /* Identify Namespace */ + cdata->nn = dev->num_ns; + for (i = 0; i < dev->num_ns; i++) { + nsdata = &dev->ns[i].nsdata; + if (dev->ns[i].active_ns) { + num_blocks = spdk_bdev_get_num_blocks(dev->ns[i].bdev); + nsdata->nsze = num_blocks; + /* ncap must be non-zero for active Namespace */ + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(dev->ns[i].bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(dev->ns[i].bdev); + dev->ns[i].block_size = spdk_bdev_get_block_size(dev->ns[i].bdev); + dev->ns[i].capacity = num_blocks * dev->ns[i].block_size; + } else { + memset(nsdata, 0, sizeof(*nsdata)); + } + } + return 0; +} + +static int +spdk_vhost_nvme_ctrlr_identify_update(struct spdk_vhost_nvme_dev *dev) +{ + struct spdk_nvme_ctrlr_data *cdata = &dev->cdata; + char sn[20]; + + /* Controller Capabilities */ + dev->cap.bits.cqr = 1; + dev->cap.bits.to = 1; + dev->cap.bits.dstrd = 0; + dev->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + dev->cap.bits.mpsmin = 0; + dev->cap.bits.mpsmax = 0; + /* MQES is 0 based value */ + dev->cap.bits.mqes = MAX_QUEUE_ENTRIES_SUPPORTED - 1; + + /* Controller Configuration */ + dev->cc.bits.en = 0; + + /* Controller Status */ + dev->csts.bits.rdy = 0; + + /* Identify Controller */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + cdata->vid = 0x8086; + cdata->ssvid = 0x8086; + spdk_strcpy_pad(cdata->mn, "SPDK Virtual NVMe Controller", sizeof(cdata->mn), ' '); + snprintf(sn, sizeof(sn), "NVMe_%s", dev->vdev.name); + spdk_strcpy_pad(cdata->sn, sn, sizeof(cdata->sn), ' '); + cdata->ieee[0] = 0xe4; + cdata->ieee[1] = 0xd2; + cdata->ieee[2] = 0x5c; + cdata->ver.bits.mjr = 1; + cdata->ver.bits.mnr = 0; + cdata->mdts = 5; /* 128 KiB */ + 
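+ /* Note (added comment): MDTS is reported as a power of two in units of the minimum memory page size (4 KiB here, since CAP.MPSMIN is set to 0 above), so 2^5 * 4 KiB = 128 KiB per command. */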
cdata->rab = 6; + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->oncs.dsm = 1; + /* Emulated NVMe controller */ + cdata->oacs.doorbell_buffer_config = 1; + + spdk_vhost_nvme_ns_identify_update(dev); + + return 0; +} + +int +spdk_vhost_nvme_dev_construct(const char *name, const char *cpumask, uint32_t num_io_queues) +{ + struct spdk_vhost_nvme_dev *dev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_nvme_dev), + SPDK_CACHE_LINE_SIZE, NULL); + int rc; + + if (dev == NULL) { + return -ENOMEM; + } + + if (num_io_queues < 1 || num_io_queues > MAX_IO_QUEUES) { + spdk_dma_free(dev); + return -EINVAL; + } + + spdk_vhost_lock(); + rc = spdk_vhost_dev_register(&dev->vdev, name, cpumask, + &spdk_vhost_nvme_device_backend); + + if (rc) { + spdk_dma_free(dev); + spdk_vhost_unlock(); + return rc; + } + + dev->num_io_queues = num_io_queues; + STAILQ_INIT(&dev->free_tasks); + TAILQ_INSERT_TAIL(&g_nvme_ctrlrs, dev, tailq); + + spdk_vhost_nvme_ctrlr_identify_update(dev); + + SPDK_NOTICELOG("Controller %s: Constructed\n", name); + spdk_vhost_unlock(); + return rc; +} + +int +spdk_vhost_nvme_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_dev *dev, *tmp; + struct spdk_vhost_nvme_ns *ns; + int rc; + uint32_t i; + + if (nvme == NULL) { + return -EINVAL; + } + + TAILQ_FOREACH_SAFE(dev, &g_nvme_ctrlrs, tailq, tmp) { + if (dev == nvme) { + TAILQ_REMOVE(&g_nvme_ctrlrs, dev, tailq); + for (i = 0; i < nvme->num_ns; i++) { + ns = &nvme->ns[i]; + if (ns->active_ns) { + spdk_vhost_nvme_deactive_ns(ns); + } + } + } + } + + rc = spdk_vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + spdk_dma_free(nvme); + return 0; +} + +int +spdk_vhost_nvme_dev_add_ns(struct spdk_vhost_dev *vdev, const char *bdev_name) +{ + struct spdk_vhost_nvme_dev *nvme = to_nvme_dev(vdev); + struct spdk_vhost_nvme_ns *ns; + struct spdk_bdev *bdev; + int rc = -1; + + if (nvme == NULL) { + return -ENODEV; + } + + if (nvme->num_ns == MAX_NAMESPACE) { + SPDK_ERRLOG("Can't support %d Namespaces\n", nvme->num_ns); + return -ENOSPC; + } + + bdev = spdk_bdev_get_by_name(bdev_name); + if (!bdev) { + SPDK_ERRLOG("could not find bdev %s\n", bdev_name); + return -ENODEV; + } + + ns = &nvme->ns[nvme->num_ns]; + rc = spdk_bdev_open(bdev, true, bdev_remove_cb, ns, &nvme->ns[nvme->num_ns].bdev_desc); + if (rc != 0) { + SPDK_ERRLOG("Could not open bdev '%s', error=%d\n", + bdev_name, rc); + return rc; + } + + nvme->ns[nvme->num_ns].bdev = bdev; + nvme->ns[nvme->num_ns].active_ns = 1; + nvme->ns[nvme->num_ns].nsid = nvme->num_ns + 1; + nvme->num_ns++; + + spdk_vhost_nvme_ns_identify_update(nvme); + + return rc; +} + +int +spdk_vhost_nvme_controller_construct(void) +{ + struct spdk_conf_section *sp; + const char *name; + const char *bdev_name; + const char *cpumask; + int rc, i = 0; + struct spdk_vhost_dev *vdev; + uint32_t ctrlr_num, io_queues; + + for (sp = spdk_conf_first_section(NULL); sp != NULL; sp = spdk_conf_next_section(sp)) { + if (!spdk_conf_section_match_prefix(sp, "VhostNvme")) { + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostNvme%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + if (name == NULL) { + SPDK_ERRLOG("VhostNvme%u: missing Name\n", ctrlr_num); + return -1; + } + + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + rc = spdk_conf_section_get_intval(sp, 
"NumberOfQueues"); + if (rc > 0) { + io_queues = rc; + } else { + io_queues = 1; + } + + rc = spdk_vhost_nvme_dev_construct(name, cpumask, io_queues); + if (rc < 0) { + SPDK_ERRLOG("VhostNvme%u: Construct failed\n", ctrlr_num); + return -1; + } + + vdev = spdk_vhost_dev_find(name); + if (!vdev) { + return -1; + } + + for (i = 0; spdk_conf_section_get_nval(sp, "Namespace", i) != NULL; i++) { + bdev_name = spdk_conf_section_get_nmval(sp, "Namespace", i, 0); + if (!bdev_name) { + SPDK_ERRLOG("namespace configuration missing bdev name\n"); + break; + } + rc = spdk_vhost_nvme_dev_add_ns(vdev, bdev_name); + if (rc < 0) { + SPDK_WARNLOG("VhostNvme%u: Construct Namespace with %s failed\n", + ctrlr_num, bdev_name); + break; + } + } + } + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_nvme", SPDK_LOG_VHOST_NVME) diff --git a/src/spdk/lib/vhost/vhost_rpc.c b/src/spdk/lib/vhost/vhost_rpc.c new file mode 100644 index 00000000..0e546c36 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_rpc.c @@ -0,0 +1,814 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "spdk/stdinc.h" + +#include "spdk_internal/log.h" +#include "spdk/rpc.h" +#include "spdk/util.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk/scsi.h" +#include "spdk/vhost.h" +#include "vhost_internal.h" +#include "spdk/bdev.h" + +struct rpc_vhost_scsi_ctrlr { + char *ctrlr; + char *cpumask; +}; + +static void +free_rpc_vhost_scsi_ctrlr(struct rpc_vhost_scsi_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static const struct spdk_json_object_decoder rpc_construct_vhost_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_scsi_ctrlr, ctrlr), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_scsi_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_construct_vhost_scsi_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_scsi_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_construct(req.ctrlr, req.cpumask); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_scsi_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_scsi_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("construct_vhost_scsi_controller", spdk_rpc_construct_vhost_scsi_controller, + SPDK_RPC_RUNTIME) + +struct rpc_add_vhost_scsi_ctrlr_lun { + char *ctrlr; + uint32_t scsi_target_num; + char *bdev_name; + + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_add_vhost_scsi_ctrlr_lun(struct rpc_add_vhost_scsi_ctrlr_lun *req) +{ + free(req->ctrlr); + free(req->bdev_name); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_add_lun[] = { + {"ctrlr", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, scsi_target_num), spdk_json_decode_uint32}, + {"bdev_name", offsetof(struct rpc_add_vhost_scsi_ctrlr_lun, bdev_name), spdk_json_decode_string }, +}; + +static int +spdk_rpc_add_vhost_scsi_lun_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_add_vhost_scsi_ctrlr_lun *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_add_tgt(vdev, rpc->scsi_target_num, rpc->bdev_name); + if (rc < 0) { + goto invalid; + } + + free_rpc_add_vhost_scsi_ctrlr_lun(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_add_vhost_scsi_ctrlr_lun(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_add_vhost_scsi_lun(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_vhost_scsi_ctrlr_lun *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if 
(spdk_json_decode_object(params, rpc_vhost_add_lun, + SPDK_COUNTOF(rpc_vhost_add_lun), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + if (req->ctrlr == NULL) { + SPDK_ERRLOG("No controller name\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_scsi_lun_cb, req); + + return; + +invalid: + if (req) { + free_rpc_add_vhost_scsi_ctrlr_lun(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("add_vhost_scsi_lun", spdk_rpc_add_vhost_scsi_lun, SPDK_RPC_RUNTIME) + +struct rpc_remove_vhost_scsi_ctrlr_target { + char *ctrlr; + uint32_t scsi_target_num; + + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_remove_vhost_scsi_ctrlr_target(struct rpc_remove_vhost_scsi_ctrlr_target *req) +{ + free(req->ctrlr); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_remove_target[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, ctrlr), spdk_json_decode_string }, + {"scsi_target_num", offsetof(struct rpc_remove_vhost_scsi_ctrlr_target, scsi_target_num), spdk_json_decode_uint32}, +}; + +static int +spdk_rpc_remove_vhost_scsi_target_finish_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + + free_rpc_remove_vhost_scsi_ctrlr_target(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; +} + +static int +spdk_rpc_remove_vhost_scsi_target_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, rpc->scsi_target_num, + spdk_rpc_remove_vhost_scsi_target_finish_cb, rpc); + if (rc < 0) { + goto invalid; + } + + return 0; + +invalid: + free_rpc_remove_vhost_scsi_ctrlr_target(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_remove_vhost_scsi_target(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_scsi_ctrlr_target *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_vhost_remove_target, + SPDK_COUNTOF(rpc_vhost_remove_target), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_scsi_target_cb, req); + + return; + +invalid: + if (req) { + free_rpc_remove_vhost_scsi_ctrlr_target(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} + +SPDK_RPC_REGISTER("remove_vhost_scsi_target", spdk_rpc_remove_vhost_scsi_target, SPDK_RPC_RUNTIME) + +struct rpc_vhost_blk_ctrlr { + char *ctrlr; + char *dev_name; + char *cpumask; + bool readonly; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_blk_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_blk_ctrlr, ctrlr), spdk_json_decode_string }, + {"dev_name", 
offsetof(struct rpc_vhost_blk_ctrlr, dev_name), spdk_json_decode_string }, + {"cpumask", offsetof(struct rpc_vhost_blk_ctrlr, cpumask), spdk_json_decode_string, true}, + {"readonly", offsetof(struct rpc_vhost_blk_ctrlr, readonly), spdk_json_decode_bool, true}, +}; + +static void +free_rpc_vhost_blk_ctrlr(struct rpc_vhost_blk_ctrlr *req) +{ + free(req->ctrlr); + free(req->dev_name); + free(req->cpumask); +} + +static void +spdk_rpc_construct_vhost_blk_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_blk_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_blk_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_blk_ctrlr), + &req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_blk_construct(req.ctrlr, req.cpumask, req.dev_name, req.readonly); + if (rc < 0) { + goto invalid; + } + + free_rpc_vhost_blk_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + free_rpc_vhost_blk_ctrlr(&req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("construct_vhost_blk_controller", spdk_rpc_construct_vhost_blk_controller, + SPDK_RPC_RUNTIME) + +struct rpc_remove_vhost_ctrlr { + char *ctrlr; + + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_remove_vhost_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_remove_vhost_ctrlr, ctrlr), spdk_json_decode_string }, +}; + +static void +free_rpc_remove_vhost_ctrlr(struct rpc_remove_vhost_ctrlr *req) +{ + free(req->ctrlr); + free(req); +} + +static int +spdk_rpc_remove_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_remove_vhost_ctrlr *ctx = arg; + struct spdk_jsonrpc_request *request = ctx->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_dev_remove(vdev); + if (rc < 0) { + goto invalid; + } + + free_rpc_remove_vhost_ctrlr(ctx); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return 0; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_remove_vhost_ctrlr(ctx); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return -1; +} + +static void +spdk_rpc_remove_vhost_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_remove_vhost_ctrlr *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_remove_vhost_ctrlr, + SPDK_COUNTOF(rpc_remove_vhost_ctrlr), req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_remove_vhost_controller_cb, req); + return; + +invalid: + if (req) { + free_rpc_remove_vhost_ctrlr(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + +} +SPDK_RPC_REGISTER("remove_vhost_controller", spdk_rpc_remove_vhost_controller, SPDK_RPC_RUNTIME) + +struct rpc_get_vhost_ctrlrs { + char *name; + struct spdk_json_write_ctx *w; + struct 
spdk_jsonrpc_request *request; +}; + +static void +_spdk_rpc_get_vhost_controller(struct spdk_json_write_ctx *w, struct spdk_vhost_dev *vdev) +{ + uint32_t delay_base_us, iops_threshold; + + spdk_vhost_get_coalescing(vdev, &delay_base_us, &iops_threshold); + + spdk_json_write_object_begin(w); + + spdk_json_write_named_string(w, "ctrlr", spdk_vhost_dev_get_name(vdev)); + spdk_json_write_named_string_fmt(w, "cpumask", "0x%s", spdk_cpuset_fmt(vdev->cpumask)); + spdk_json_write_named_uint32(w, "delay_base_us", delay_base_us); + spdk_json_write_named_uint32(w, "iops_threshold", iops_threshold); + spdk_json_write_named_string(w, "socket", vdev->path); + + spdk_json_write_named_object_begin(w, "backend_specific"); + spdk_vhost_dump_info_json(vdev, w); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); +} + +static int +spdk_rpc_get_vhost_controllers_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_get_vhost_ctrlrs *ctx = arg; + + assert(ctx->name == NULL); + + if (vdev == NULL) { + spdk_json_write_array_end(ctx->w); + spdk_jsonrpc_end_result(ctx->request, ctx->w); + free(ctx); + return 0; + } + + _spdk_rpc_get_vhost_controller(ctx->w, vdev); + return 0; +} + +static int +spdk_rpc_get_vhost_controller_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_get_vhost_ctrlrs *ctx = arg; + + assert(ctx->name != NULL); + + if (vdev == NULL) { + spdk_jsonrpc_send_error_response(ctx->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(ENODEV)); + goto free_name_ctx; + } + + ctx->w = spdk_jsonrpc_begin_result(ctx->request); + if (ctx->w == NULL) { + goto free_name_ctx; + } + + spdk_json_write_array_begin(ctx->w); + _spdk_rpc_get_vhost_controller(ctx->w, vdev); + spdk_json_write_array_end(ctx->w); + + spdk_jsonrpc_end_result(ctx->request, ctx->w); + +free_name_ctx: + free(ctx->name); + free(ctx); + return 0; +} + +static const struct spdk_json_object_decoder rpc_get_vhost_ctrlrs_decoders[] = { + {"name", offsetof(struct rpc_get_vhost_ctrlrs, name), spdk_json_decode_string, true}, +}; + +static void +spdk_rpc_get_vhost_controllers(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_get_vhost_ctrlrs *ctx; + struct spdk_json_write_ctx *w; + + ctx = calloc(1, sizeof(*ctx)); + if (ctx == NULL) { + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INTERNAL_ERROR, + spdk_strerror(ENOMEM)); + return; + } + + if (params && spdk_json_decode_object(params, rpc_get_vhost_ctrlrs_decoders, + SPDK_COUNTOF(rpc_get_vhost_ctrlrs_decoders), ctx)) { + SPDK_ERRLOG("spdk_json_decode_object failed\n"); + free(ctx); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + "Invalid parameters"); + return; + } + + if (ctx->name) { + ctx->request = request; + spdk_vhost_call_external_event(ctx->name, spdk_rpc_get_vhost_controller_cb, ctx); + return; + } + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + free(ctx); + return; + } + + spdk_json_write_array_begin(w); + + ctx->w = w; + ctx->request = request; + spdk_vhost_call_external_event_foreach(spdk_rpc_get_vhost_controllers_cb, ctx); +} +SPDK_RPC_REGISTER("get_vhost_controllers", spdk_rpc_get_vhost_controllers, SPDK_RPC_RUNTIME) + + +struct rpc_vhost_ctrlr_coalescing { + char *ctrlr; + uint32_t delay_base_us; + uint32_t iops_threshold; + struct spdk_jsonrpc_request *request; +}; + +static const struct spdk_json_object_decoder rpc_set_vhost_ctrlr_coalescing[] = { + {"ctrlr", offsetof(struct rpc_vhost_ctrlr_coalescing, ctrlr), spdk_json_decode_string }, + 
{"delay_base_us", offsetof(struct rpc_vhost_ctrlr_coalescing, delay_base_us), spdk_json_decode_uint32}, + {"iops_threshold", offsetof(struct rpc_vhost_ctrlr_coalescing, iops_threshold), spdk_json_decode_uint32}, +}; + +static void +free_rpc_set_vhost_controllers_event_coalescing(struct rpc_vhost_ctrlr_coalescing *req) +{ + if (!req) { + return; + } + + free(req->ctrlr); + free(req); +} + +static int +spdk_rpc_set_vhost_controller_coalescing_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_vhost_ctrlr_coalescing *req = arg; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_set_coalescing(vdev, req->delay_base_us, req->iops_threshold); + if (rc) { + goto invalid; + } + + w = spdk_jsonrpc_begin_result(req->request); + if (w != NULL) { + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(req->request, w); + } + + free_rpc_set_vhost_controllers_event_coalescing(req); + return 0; + +invalid: + spdk_jsonrpc_send_error_response(req->request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + free_rpc_set_vhost_controllers_event_coalescing(req); + return 0; +} + +static void +spdk_rpc_set_vhost_controller_coalescing(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_ctrlr_coalescing *req; + int rc; + + req = calloc(1, sizeof(struct rpc_vhost_ctrlr_coalescing)); + if (!req) { + rc = -ENOMEM; + goto invalid; + } + + if (spdk_json_decode_object(params, rpc_set_vhost_ctrlr_coalescing, + SPDK_COUNTOF(rpc_set_vhost_ctrlr_coalescing), req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + req->request = request; + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_set_vhost_controller_coalescing_cb, req); + return; + +invalid: + free_rpc_set_vhost_controllers_event_coalescing(req); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("set_vhost_controller_coalescing", spdk_rpc_set_vhost_controller_coalescing, + SPDK_RPC_RUNTIME) + +struct rpc_vhost_nvme_ctrlr { + char *ctrlr; + uint32_t io_queues; + char *cpumask; +}; + +static const struct spdk_json_object_decoder rpc_construct_vhost_nvme_ctrlr[] = { + {"ctrlr", offsetof(struct rpc_vhost_nvme_ctrlr, ctrlr), spdk_json_decode_string }, + {"io_queues", offsetof(struct rpc_vhost_nvme_ctrlr, io_queues), spdk_json_decode_uint32}, + {"cpumask", offsetof(struct rpc_vhost_nvme_ctrlr, cpumask), spdk_json_decode_string, true}, +}; + +static void +free_rpc_vhost_nvme_ctrlr(struct rpc_vhost_nvme_ctrlr *req) +{ + free(req->ctrlr); + free(req->cpumask); +} + +static void +spdk_rpc_construct_vhost_nvme_controller(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_vhost_nvme_ctrlr req = {0}; + struct spdk_json_write_ctx *w; + int rc; + + if (spdk_json_decode_object(params, rpc_construct_vhost_nvme_ctrlr, + SPDK_COUNTOF(rpc_construct_vhost_nvme_ctrlr), + &req)) { + rc = -EINVAL; + goto invalid; + } + + rc = spdk_vhost_nvme_dev_construct(req.ctrlr, req.cpumask, req.io_queues); + if (rc < 0) { + free_rpc_vhost_nvme_ctrlr(&req); + goto invalid; + } + + free_rpc_vhost_nvme_ctrlr(&req); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return; + +invalid: + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + 
+} +SPDK_RPC_REGISTER("construct_vhost_nvme_controller", spdk_rpc_construct_vhost_nvme_controller, + SPDK_RPC_RUNTIME) + +struct rpc_add_vhost_nvme_ctrlr_ns { + char *ctrlr; + char *bdev_name; + struct spdk_jsonrpc_request *request; +}; + +static void +free_rpc_add_vhost_nvme_ctrlr_ns(struct rpc_add_vhost_nvme_ctrlr_ns *req) +{ + free(req->ctrlr); + free(req->bdev_name); + free(req); +} + +static const struct spdk_json_object_decoder rpc_vhost_nvme_add_ns[] = { + {"ctrlr", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, ctrlr), spdk_json_decode_string }, + {"bdev_name", offsetof(struct rpc_add_vhost_nvme_ctrlr_ns, bdev_name), spdk_json_decode_string }, +}; + +static int +spdk_rpc_add_vhost_nvme_ns_cb(struct spdk_vhost_dev *vdev, void *arg) +{ + struct rpc_add_vhost_nvme_ctrlr_ns *rpc = arg; + struct spdk_jsonrpc_request *request = rpc->request; + struct spdk_json_write_ctx *w; + int rc; + + if (vdev == NULL) { + rc = -ENODEV; + goto invalid; + } + + rc = spdk_vhost_nvme_dev_add_ns(vdev, rpc->bdev_name); + if (rc < 0) { + goto invalid; + } + free_rpc_add_vhost_nvme_ctrlr_ns(rpc); + + w = spdk_jsonrpc_begin_result(request); + if (w == NULL) { + return -1; + } + + spdk_json_write_bool(w, true); + spdk_jsonrpc_end_result(request, w); + return 0; + +invalid: + free_rpc_add_vhost_nvme_ctrlr_ns(rpc); + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); + return rc; +} + +static void +spdk_rpc_add_vhost_nvme_ns(struct spdk_jsonrpc_request *request, + const struct spdk_json_val *params) +{ + struct rpc_add_vhost_nvme_ctrlr_ns *req; + int rc; + + req = calloc(1, sizeof(*req)); + if (req == NULL) { + rc = -ENOMEM; + goto invalid; + } + + req->request = request; + if (spdk_json_decode_object(params, rpc_vhost_nvme_add_ns, + SPDK_COUNTOF(rpc_vhost_nvme_add_ns), + req)) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_RPC, "spdk_json_decode_object failed\n"); + rc = -EINVAL; + goto invalid; + } + + spdk_vhost_call_external_event(req->ctrlr, spdk_rpc_add_vhost_nvme_ns_cb, req); + return; + +invalid: + if (req) { + free_rpc_add_vhost_nvme_ctrlr_ns(req); + } + spdk_jsonrpc_send_error_response(request, SPDK_JSONRPC_ERROR_INVALID_PARAMS, + spdk_strerror(-rc)); +} +SPDK_RPC_REGISTER("add_vhost_nvme_ns", spdk_rpc_add_vhost_nvme_ns, SPDK_RPC_RUNTIME) + + +SPDK_LOG_REGISTER_COMPONENT("vhost_rpc", SPDK_LOG_VHOST_RPC) diff --git a/src/spdk/lib/vhost/vhost_scsi.c b/src/spdk/lib/vhost/vhost_scsi.c new file mode 100644 index 00000000..aefa4c45 --- /dev/null +++ b/src/spdk/lib/vhost/vhost_scsi.c @@ -0,0 +1,1271 @@ +/*- + * BSD LICENSE + * + * Copyright(c) Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include + +#include "spdk/env.h" +#include "spdk/thread.h" +#include "spdk/scsi.h" +#include "spdk/scsi_spec.h" +#include "spdk/conf.h" +#include "spdk/event.h" +#include "spdk/util.h" +#include "spdk/likely.h" + +#include "spdk/vhost.h" +#include "vhost_internal.h" + +/* Features supported by SPDK VHOST lib. */ +#define SPDK_VHOST_SCSI_FEATURES (SPDK_VHOST_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_INOUT) | \ + (1ULL << VIRTIO_SCSI_F_HOTPLUG) | \ + (1ULL << VIRTIO_SCSI_F_CHANGE ) | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +/* Features that are specified in VIRTIO SCSI but currently not supported: + * - Live migration not supported yet + * - T10 PI + */ +#define SPDK_VHOST_SCSI_DISABLED_FEATURES (SPDK_VHOST_DISABLED_FEATURES | \ + (1ULL << VIRTIO_SCSI_F_T10_PI )) + +#define MGMT_POLL_PERIOD_US (1000 * 5) + +#define VIRTIO_SCSI_CONTROLQ 0 +#define VIRTIO_SCSI_EVENTQ 1 +#define VIRTIO_SCSI_REQUESTQ 2 + +struct spdk_scsi_dev_vhost_state { + bool removed; + spdk_vhost_event_fn remove_cb; + void *remove_ctx; +}; + +struct spdk_vhost_scsi_dev { + struct spdk_vhost_dev vdev; + struct spdk_scsi_dev *scsi_dev[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + struct spdk_scsi_dev_vhost_state scsi_dev_state[SPDK_VHOST_SCSI_CTRLR_MAX_DEVS]; + + struct spdk_poller *requestq_poller; + struct spdk_poller *mgmt_poller; + struct spdk_vhost_dev_destroy_ctx destroy_ctx; +} __rte_cache_aligned; + +struct spdk_vhost_scsi_task { + struct spdk_scsi_task scsi; + struct iovec iovs[SPDK_VHOST_IOVS_MAX]; + + union { + struct virtio_scsi_cmd_resp *resp; + struct virtio_scsi_ctrl_tmf_resp *tmf_resp; + }; + + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev *scsi_dev; + + /** Number of bytes that were written. */ + uint32_t used_len; + + int req_idx; + + /* If set, the task is currently used for I/O processing. 
*/ + bool used; + + struct spdk_vhost_virtqueue *vq; +}; + +static int spdk_vhost_scsi_start(struct spdk_vhost_dev *, void *); +static int spdk_vhost_scsi_stop(struct spdk_vhost_dev *, void *); +static void spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static void spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, + struct spdk_json_write_ctx *w); +static int spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev); + +const struct spdk_vhost_dev_backend spdk_vhost_scsi_device_backend = { + .virtio_features = SPDK_VHOST_SCSI_FEATURES, + .disabled_features = SPDK_VHOST_SCSI_DISABLED_FEATURES, + .start_device = spdk_vhost_scsi_start, + .stop_device = spdk_vhost_scsi_stop, + .dump_info_json = spdk_vhost_scsi_dump_info_json, + .write_config_json = spdk_vhost_scsi_write_config_json, + .remove_device = spdk_vhost_scsi_dev_remove, +}; + +static void +spdk_vhost_scsi_task_put(struct spdk_vhost_scsi_task *task) +{ + spdk_scsi_task_put(&task->scsi); +} + +static void +spdk_vhost_scsi_task_free_cb(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + assert(task->svdev->vdev.task_cnt > 0); + task->svdev->vdev.task_cnt--; + task->used = false; +} + +static void +process_removed_devs(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_scsi_dev *dev; + struct spdk_scsi_dev_vhost_state *state; + int i; + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + dev = svdev->scsi_dev[i]; + state = &svdev->scsi_dev_state[i]; + + if (dev && state->removed && !spdk_scsi_dev_has_pending_tasks(dev)) { + spdk_scsi_dev_free_io_channels(dev); + svdev->scsi_dev[i] = NULL; + spdk_scsi_dev_destruct(dev); + if (state->remove_cb) { + state->remove_cb(&svdev->vdev, state->remove_ctx); + state->remove_cb = NULL; + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: hot-detached device 'Dev %u'.\n", + svdev->vdev.name, i); + } + } +} + +static void +eventq_enqueue(struct spdk_vhost_scsi_dev *svdev, unsigned scsi_dev_num, uint32_t event, + uint32_t reason) +{ + struct spdk_vhost_virtqueue *vq; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_event *desc_ev; + uint32_t desc_table_size, req_size = 0; + uint16_t req; + int rc; + + assert(scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + vq = &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ]; + + if (spdk_vhost_vq_avail_ring_get(vq, &req, 1) != 1) { + SPDK_ERRLOG("Controller %s: Failed to send virtio event (no avail ring entries?).\n", + svdev->vdev.name); + return; + } + + rc = spdk_vhost_vq_get_desc(&svdev->vdev, vq, req, &desc, &desc_table, &desc_table_size); + if (rc != 0 || desc->len < sizeof(*desc_ev)) { + SPDK_ERRLOG("Controller %s: Invalid eventq descriptor at index %"PRIu16".\n", + svdev->vdev.name, req); + goto out; + } + + desc_ev = spdk_vhost_gpa_to_vva(&svdev->vdev, desc->addr, sizeof(*desc_ev)); + if (desc_ev == NULL) { + SPDK_ERRLOG("Controller %s: Eventq descriptor at index %"PRIu16" points to unmapped guest memory address %p.\n", + svdev->vdev.name, req, (void *)(uintptr_t)desc->addr); + goto out; + } + + desc_ev->event = event; + desc_ev->lun[0] = 1; + desc_ev->lun[1] = scsi_dev_num; + /* virtio LUN id 0 can refer either to the entire device + * or actual LUN 0 (the only supported by vhost for now) + */ + desc_ev->lun[2] = 0 >> 8; + desc_ev->lun[3] = 0 & 0xFF; + /* virtio doesn't specify any strict format for LUN id (bytes 2 and 3) + * current implementation relies on linux kernel sources + */ + memset(&desc_ev->lun[4], 
0, 4); + desc_ev->reason = reason; + req_size = sizeof(*desc_ev); + +out: + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, req, req_size); +} + +static void +submit_completion(struct spdk_vhost_scsi_task *task) +{ + spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx, + task->used_len); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Finished task (%p) req_idx=%d\n", task, task->req_idx); + + spdk_vhost_scsi_task_put(task); +} + +static void +spdk_vhost_scsi_task_mgmt_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + submit_completion(task); +} + +static void +spdk_vhost_scsi_task_cpl(struct spdk_scsi_task *scsi_task) +{ + struct spdk_vhost_scsi_task *task = SPDK_CONTAINEROF(scsi_task, struct spdk_vhost_scsi_task, scsi); + + /* The SCSI task has completed. Do final processing and then post + notification to the virtqueue's "used" ring. + */ + task->resp->status = task->scsi.status; + + if (task->scsi.status != SPDK_SCSI_STATUS_GOOD) { + memcpy(task->resp->sense, task->scsi.sense_data, task->scsi.sense_data_len); + task->resp->sense_len = task->scsi.sense_data_len; + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Task (%p) req_idx=%d failed - status=%u\n", task, task->req_idx, + task->scsi.status); + } + assert(task->scsi.transfer_len == task->scsi.length); + task->resp->resid = task->scsi.length - task->scsi.data_transferred; + + submit_completion(task); +} + +static void +task_submit(struct spdk_vhost_scsi_task *task) +{ + task->resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_task(task->scsi_dev, &task->scsi); +} + +static void +mgmt_task_submit(struct spdk_vhost_scsi_task *task, enum spdk_scsi_task_func func) +{ + task->tmf_resp->response = VIRTIO_SCSI_S_OK; + spdk_scsi_dev_queue_mgmt_task(task->scsi_dev, &task->scsi, func); +} + +static void +invalid_request(struct spdk_vhost_scsi_task *task) +{ + spdk_vhost_vq_used_ring_enqueue(&task->svdev->vdev, task->vq, task->req_idx, + task->used_len); + spdk_vhost_scsi_task_put(task); + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "Invalid request (status=%" PRIu8")\n", + task->resp ? task->resp->response : -1); +} + +static int +spdk_vhost_scsi_task_init_target(struct spdk_vhost_scsi_task *task, const __u8 *lun) +{ + struct spdk_scsi_dev *dev; + uint16_t lun_id = (((uint16_t)lun[2] << 8) | lun[3]) & 0x3FFF; + + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN", lun, 8); + + /* First byte must be 1 and second is target */ + if (lun[0] != 1 || lun[1] >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + return -1; + } + + dev = task->svdev->scsi_dev[lun[1]]; + task->scsi_dev = dev; + if (dev == NULL || task->svdev->scsi_dev_state[lun[1]].removed) { + /* If dev has been hotdetached, return 0 to allow sending + * additional hotremove event via sense codes. + */ + return task->svdev->scsi_dev_state[lun[1]].removed ? 
0 : -1; + } + + task->scsi.target_port = spdk_scsi_dev_find_port_by_id(task->scsi_dev, 0); + task->scsi.lun = spdk_scsi_dev_get_lun(dev, lun_id); + return 0; +} + +static void +process_ctrl_request(struct spdk_vhost_scsi_task *task) +{ + struct spdk_vhost_dev *vdev = &task->svdev->vdev; + struct vring_desc *desc, *desc_table; + struct virtio_scsi_ctrl_tmf_req *ctrl_req; + struct virtio_scsi_ctrl_an_resp *an_resp; + uint32_t desc_table_size, used_len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_mgmt_cpl, spdk_vhost_scsi_task_free_cb); + rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_size); + if (spdk_unlikely(rc != 0)) { + SPDK_ERRLOG("%s: Invalid controlq descriptor at index %d.\n", + vdev->name, task->req_idx); + goto out; + } + + ctrl_req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*ctrl_req)); + if (ctrl_req == NULL) { + SPDK_ERRLOG("%s: Invalid task management request at index %d.\n", + vdev->name, task->req_idx); + goto out; + } + + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, + "Processing controlq descriptor: desc %d/%p, desc_addr %p, len %d, flags %d, last_used_idx %d; kickfd %d; size %d\n", + task->req_idx, desc, (void *)desc->addr, desc->len, desc->flags, task->vq->vring.last_used_idx, + task->vq->vring.kickfd, task->vq->vring.size); + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_QUEUE, "Request descriptor", (uint8_t *)ctrl_req, + desc->len); + + spdk_vhost_scsi_task_init_target(task, ctrl_req->lun); + + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_size); + if (spdk_unlikely(desc == NULL)) { + SPDK_ERRLOG("%s: No response descriptor for controlq request %d.\n", + vdev->name, task->req_idx); + goto out; + } + + /* Process the TMF request */ + switch (ctrl_req->type) { + case VIRTIO_SCSI_T_TMF: + task->tmf_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->tmf_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_tmf_resp) || task->tmf_resp == NULL)) { + SPDK_ERRLOG("%s: TMF response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto out; + } + + /* Check if we are processing a valid request */ + if (task->scsi_dev == NULL) { + task->tmf_resp->response = VIRTIO_SCSI_S_BAD_TARGET; + break; + } + + switch (ctrl_req->subtype) { + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + /* Handle LUN reset */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "LUN reset\n"); + + mgmt_task_submit(task, SPDK_SCSI_TASK_FUNC_LUN_RESET); + return; + default: + task->tmf_resp->response = VIRTIO_SCSI_S_ABORTED; + /* Unsupported command */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported TMF command %x\n", ctrl_req->subtype); + break; + } + break; + case VIRTIO_SCSI_T_AN_QUERY: + case VIRTIO_SCSI_T_AN_SUBSCRIBE: { + an_resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*an_resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_ctrl_an_resp) || an_resp == NULL)) { + SPDK_WARNLOG("%s: Asynchronous response descriptor points to invalid guest memory region\n", + vdev->name); + goto out; + } + + an_resp->response = VIRTIO_SCSI_S_ABORTED; + break; + } + default: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_QUEUE, "Unsupported control command %x\n", ctrl_req->type); + break; + } + + used_len = sizeof(struct virtio_scsi_ctrl_tmf_resp); +out: + spdk_vhost_vq_used_ring_enqueue(vdev, task->vq, task->req_idx, used_len); + spdk_vhost_scsi_task_put(task); +} + +/* + * Process task's descriptor chain and setup data related fields. 
+ * Return + * -1 if request is invalid and must be aborted, + * 0 if all data are set. + */ +static int +task_data_setup(struct spdk_vhost_scsi_task *task, + struct virtio_scsi_cmd_req **req) +{ + struct spdk_vhost_dev *vdev = &task->svdev->vdev; + struct vring_desc *desc, *desc_table; + struct iovec *iovs = task->iovs; + uint16_t iovcnt = 0; + uint32_t desc_table_len, len = 0; + int rc; + + spdk_scsi_task_construct(&task->scsi, spdk_vhost_scsi_task_cpl, spdk_vhost_scsi_task_free_cb); + + rc = spdk_vhost_vq_get_desc(vdev, task->vq, task->req_idx, &desc, &desc_table, &desc_table_len); + /* First descriptor must be readable */ + if (spdk_unlikely(rc != 0 || spdk_vhost_vring_desc_is_wr(desc) || + desc->len < sizeof(struct virtio_scsi_cmd_req))) { + SPDK_WARNLOG("%s: invalid first (request) descriptor at index %"PRIu16".\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + *req = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(**req)); + if (spdk_unlikely(*req == NULL)) { + SPDK_WARNLOG("%s: Request descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + /* Each request must have at least 2 descriptors (e.g. request and response) */ + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (desc == NULL) { + SPDK_WARNLOG("%s: Descriptor chain at index %d contains neither payload nor response buffer.\n", + vdev->name, task->req_idx); + goto invalid_task; + } + task->scsi.dxfer_dir = spdk_vhost_vring_desc_is_wr(desc) ? SPDK_SCSI_DIR_FROM_DEV : + SPDK_SCSI_DIR_TO_DEV; + task->scsi.iovs = iovs; + + if (task->scsi.dxfer_dir == SPDK_SCSI_DIR_FROM_DEV) { + /* + * FROM_DEV (READ): [RD_req][WR_resp][WR_buf0]...[WR_bufN] + */ + task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid descriptor chain at request index %d (descriptor id overflow?).\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + if (desc == NULL) { + /* + * TEST UNIT READY command and some others might not contain any payload and this is not an error. + */ + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, + "No payload descriptors for FROM DEV command req_idx=%"PRIu16".\n", task->req_idx); + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "CDB=", (*req)->cdb, VIRTIO_SCSI_CDB_SIZE); + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + task->scsi.iovcnt = 1; + task->scsi.iovs[0].iov_len = 0; + task->scsi.length = 0; + task->scsi.transfer_len = 0; + return 0; + } + + /* All remaining descriptors are data. 
*/ + while (desc) { + if (spdk_unlikely(!spdk_vhost_vring_desc_is_wr(desc))) { + SPDK_WARNLOG("FROM DEV cmd: descriptor nr %" PRIu16" in payload chain is read only.\n", iovcnt); + goto invalid_task; + } + + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + rc = spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(rc != 0)) { + SPDK_WARNLOG("%s: invalid payload in descriptor chain starting at index %d.\n", + vdev->name, task->req_idx); + goto invalid_task; + } + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp) + len; + } else { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "TO DEV"); + /* + * TO_DEV (WRITE):[RD_req][RD_buf0]...[RD_bufN][WR_resp] + * No need to check descriptor WR flag as this is done while setting scsi.dxfer_dir. + */ + + /* Process descriptors up to response. */ + while (!spdk_vhost_vring_desc_is_wr(desc)) { + if (spdk_unlikely(spdk_vhost_vring_desc_to_iov(vdev, iovs, &iovcnt, desc))) { + goto invalid_task; + } + len += desc->len; + + spdk_vhost_vring_desc_get_next(&desc, desc_table, desc_table_len); + if (spdk_unlikely(desc == NULL)) { + SPDK_WARNLOG("TO_DEV cmd: no response descriptor.\n"); + goto invalid_task; + } + } + + task->resp = spdk_vhost_gpa_to_vva(vdev, desc->addr, sizeof(*task->resp)); + if (spdk_unlikely(desc->len < sizeof(struct virtio_scsi_cmd_resp) || task->resp == NULL)) { + SPDK_WARNLOG("%s: Response descriptor at index %d points to invalid guest memory region\n", + vdev->name, task->req_idx); + goto invalid_task; + } + + task->used_len = sizeof(struct virtio_scsi_cmd_resp); + } + + task->scsi.iovcnt = iovcnt; + task->scsi.length = len; + task->scsi.transfer_len = len; + return 0; + +invalid_task: + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI_DATA, "%s: Invalid task at index %"PRIu16".\n", + vdev->name, task->req_idx); + return -1; +} + +static int +process_request(struct spdk_vhost_scsi_task *task) +{ + struct virtio_scsi_cmd_req *req; + int result; + + result = task_data_setup(task, &req); + if (result) { + return result; + } + + result = spdk_vhost_scsi_task_init_target(task, req->lun); + if (spdk_unlikely(result != 0)) { + task->resp->response = VIRTIO_SCSI_S_BAD_TARGET; + return -1; + } + + task->scsi.cdb = req->cdb; + SPDK_TRACEDUMP(SPDK_LOG_VHOST_SCSI_DATA, "request CDB", req->cdb, VIRTIO_SCSI_CDB_SIZE); + + if (spdk_unlikely(task->scsi.lun == NULL)) { + spdk_scsi_task_process_null_lun(&task->scsi); + task->resp->response = VIRTIO_SCSI_S_OK; + return 1; + } + + return 0; +} + +static void +process_controlq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_scsi_task *task; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + for (i = 0; i < reqs_cnt; i++) { + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: invalid entry in avail ring. Buffer '%"PRIu16"' exceeds virtqueue size (%"PRIu16")\n", + svdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: invalid entry in avail ring. 
Buffer '%"PRIu16"' is still in use!\n", + svdev->vdev.name, reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + svdev->vdev.task_cnt++; + memset(&task->scsi, 0, sizeof(task->scsi)); + task->tmf_resp = NULL; + task->used = true; + process_ctrl_request(task); + } +} + +static void +process_requestq(struct spdk_vhost_scsi_dev *svdev, struct spdk_vhost_virtqueue *vq) +{ + struct spdk_vhost_scsi_task *task; + uint16_t reqs[32]; + uint16_t reqs_cnt, i; + int result; + + reqs_cnt = spdk_vhost_vq_avail_ring_get(vq, reqs, SPDK_COUNTOF(reqs)); + assert(reqs_cnt <= 32); + + for (i = 0; i < reqs_cnt; i++) { + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Starting processing request idx %"PRIu16"======\n", + reqs[i]); + + if (spdk_unlikely(reqs[i] >= vq->vring.size)) { + SPDK_ERRLOG("%s: request idx '%"PRIu16"' exceeds virtqueue size (%"PRIu16").\n", + svdev->vdev.name, reqs[i], vq->vring.size); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[reqs[i]]; + if (spdk_unlikely(task->used)) { + SPDK_ERRLOG("%s: request with idx '%"PRIu16"' is already pending.\n", + svdev->vdev.name, reqs[i]); + spdk_vhost_vq_used_ring_enqueue(&svdev->vdev, vq, reqs[i], 0); + continue; + } + + svdev->vdev.task_cnt++; + memset(&task->scsi, 0, sizeof(task->scsi)); + task->resp = NULL; + task->used = true; + task->used_len = 0; + result = process_request(task); + if (likely(result == 0)) { + task_submit(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d submitted ======\n", task, + task->req_idx); + } else if (result > 0) { + spdk_vhost_scsi_task_cpl(&task->scsi); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d finished early ======\n", task, + task->req_idx); + } else { + invalid_request(task); + SPDK_DEBUGLOG(SPDK_LOG_VHOST_SCSI, "====== Task %p req_idx %d failed ======\n", task, + task->req_idx); + } + } +} + +static int +vdev_mgmt_worker(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + + process_removed_devs(svdev); + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_EVENTQ]); + + process_controlq(svdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]); + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[VIRTIO_SCSI_CONTROLQ]); + + return -1; +} + +static int +vdev_worker(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + uint32_t q_idx; + + for (q_idx = VIRTIO_SCSI_REQUESTQ; q_idx < svdev->vdev.max_queues; q_idx++) { + process_requestq(svdev, &svdev->vdev.virtqueue[q_idx]); + } + + spdk_vhost_dev_used_signal(&svdev->vdev); + + return -1; +} + +static struct spdk_vhost_scsi_dev * +to_scsi_dev(struct spdk_vhost_dev *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->backend != &spdk_vhost_scsi_device_backend) { + SPDK_ERRLOG("%s: not a vhost-scsi device.\n", ctrlr->name); + return NULL; + } + + return SPDK_CONTAINEROF(ctrlr, struct spdk_vhost_scsi_dev, vdev); +} + +int +spdk_vhost_scsi_dev_construct(const char *name, const char *cpumask) +{ + struct spdk_vhost_scsi_dev *svdev = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_dev), + SPDK_CACHE_LINE_SIZE, NULL); + int rc; + + if (svdev == NULL) { + return -ENOMEM; + } + + spdk_vhost_lock(); + rc = spdk_vhost_dev_register(&svdev->vdev, name, cpumask, + &spdk_vhost_scsi_device_backend); + + if (rc) { + spdk_dma_free(svdev); + } + + spdk_vhost_unlock(); + return rc; +} + +static int +spdk_vhost_scsi_dev_remove(struct spdk_vhost_dev *vdev) +{ + struct 
spdk_vhost_scsi_dev *svdev = to_scsi_dev(vdev); + int rc, i; + + if (svdev == NULL) { + return -EINVAL; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; ++i) { + if (svdev->scsi_dev[i]) { + if (vdev->registered) { + SPDK_ERRLOG("Trying to remove non-empty controller: %s.\n", vdev->name); + return -EBUSY; + } + + rc = spdk_vhost_scsi_dev_remove_tgt(vdev, i, NULL, NULL); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to force-remove target %d\n", vdev->name, i); + return rc; + } + } + } + + rc = spdk_vhost_dev_unregister(vdev); + if (rc != 0) { + return rc; + } + + spdk_dma_free(svdev); + return 0; +} + +struct spdk_scsi_dev * +spdk_vhost_scsi_dev_get_tgt(struct spdk_vhost_dev *vdev, uint8_t num) +{ + struct spdk_vhost_scsi_dev *svdev; + + assert(num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + svdev = to_scsi_dev(vdev); + + return svdev ? svdev->scsi_dev[num] : NULL; +} + +static void +spdk_vhost_scsi_lun_hotremove(const struct spdk_scsi_lun *lun, void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + const struct spdk_scsi_dev *scsi_dev; + unsigned scsi_dev_num; + + assert(lun != NULL); + assert(svdev != NULL); + if (svdev->vdev.lcore != -1 && + !spdk_vhost_dev_has_feature(&svdev->vdev, VIRTIO_SCSI_F_HOTPLUG)) { + SPDK_WARNLOG("%s: hotremove is not enabled for this controller.\n", svdev->vdev.name); + return; + } + + scsi_dev = spdk_scsi_lun_get_dev(lun); + for (scsi_dev_num = 0; scsi_dev_num < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; scsi_dev_num++) { + if (svdev->scsi_dev[scsi_dev_num] == scsi_dev) { + break; + } + } + + if (scsi_dev_num == SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + /* The entire device has been already removed. */ + return; + } + + /* remove entire device */ + spdk_vhost_scsi_dev_remove_tgt(&svdev->vdev, scsi_dev_num, NULL, NULL); +} + +int +spdk_vhost_scsi_dev_add_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + const char *bdev_name) +{ + struct spdk_vhost_scsi_dev *svdev; + char target_name[SPDK_SCSI_DEV_MAX_NAME]; + int lun_id_list[1]; + const char *bdev_names_list[1]; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + return -EINVAL; + } + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("Controller %d target number too big (max %d)\n", scsi_tgt_num, + SPDK_VHOST_SCSI_CTRLR_MAX_DEVS); + return -EINVAL; + } + + if (bdev_name == NULL) { + SPDK_ERRLOG("No lun name specified\n"); + return -EINVAL; + } + + if (svdev->scsi_dev[scsi_tgt_num] != NULL) { + SPDK_ERRLOG("Controller %s target %u already occupied\n", vdev->name, scsi_tgt_num); + return -EEXIST; + } + + /* + * At this stage only one LUN per target + */ + snprintf(target_name, sizeof(target_name), "Target %u", scsi_tgt_num); + lun_id_list[0] = 0; + bdev_names_list[0] = (char *)bdev_name; + + svdev->scsi_dev_state[scsi_tgt_num].removed = false; + svdev->scsi_dev[scsi_tgt_num] = spdk_scsi_dev_construct(target_name, bdev_names_list, lun_id_list, + 1, + SPDK_SPC_PROTOCOL_IDENTIFIER_SAS, spdk_vhost_scsi_lun_hotremove, svdev); + + if (svdev->scsi_dev[scsi_tgt_num] == NULL) { + SPDK_ERRLOG("Couldn't create spdk SCSI target '%s' using bdev '%s' in controller: %s\n", + target_name, bdev_name, vdev->name); + return -EINVAL; + } + spdk_scsi_dev_add_port(svdev->scsi_dev[scsi_tgt_num], 0, "vhost"); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Controller %s: defined target '%s' using bdev '%s'\n", + vdev->name, target_name, bdev_name); + + if (vdev->lcore == -1) { + /* All done. 
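+ * The controller is not being polled by any core yet, so there are no I/O channels
+ * to allocate and no hotplug event to enqueue.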
*/ + return 0; + } + + spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[scsi_tgt_num]); + + if (spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) { + eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, + VIRTIO_SCSI_EVT_RESET_RESCAN); + } else { + SPDK_NOTICELOG("Device %s does not support hotplug. " + "Please restart the driver or perform a rescan.\n", + vdev->name); + } + + return 0; +} + +int +spdk_vhost_scsi_dev_remove_tgt(struct spdk_vhost_dev *vdev, unsigned scsi_tgt_num, + spdk_vhost_event_fn cb_fn, void *cb_arg) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_dev *scsi_dev; + struct spdk_scsi_dev_vhost_state *scsi_dev_state; + int rc = 0; + + if (scsi_tgt_num >= SPDK_VHOST_SCSI_CTRLR_MAX_DEVS) { + SPDK_ERRLOG("%s: invalid target number %d\n", vdev->name, scsi_tgt_num); + return -EINVAL; + } + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + return -ENODEV; + } + + scsi_dev = svdev->scsi_dev[scsi_tgt_num]; + if (scsi_dev == NULL) { + SPDK_ERRLOG("Controller %s target %u is not occupied\n", vdev->name, scsi_tgt_num); + return -ENODEV; + } + + if (svdev->vdev.lcore == -1) { + /* controller is not in use, remove dev and exit */ + svdev->scsi_dev[scsi_tgt_num] = NULL; + spdk_scsi_dev_destruct(scsi_dev); + if (cb_fn) { + rc = cb_fn(vdev, cb_arg); + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: removed target 'Target %u'\n", + vdev->name, scsi_tgt_num); + return rc; + } + + if (!spdk_vhost_dev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) { + SPDK_WARNLOG("%s: 'Target %u' is in use and hot-detach is not enabled for this controller.\n", + svdev->vdev.name, scsi_tgt_num); + return -ENOTSUP; + } + + scsi_dev_state = &svdev->scsi_dev_state[scsi_tgt_num]; + if (scsi_dev_state->removed) { + SPDK_WARNLOG("%s: 'Target %u' has been already marked to hotremove.\n", svdev->vdev.name, + scsi_tgt_num); + return -EBUSY; + } + + scsi_dev_state->remove_cb = cb_fn; + scsi_dev_state->remove_ctx = cb_arg; + scsi_dev_state->removed = true; + eventq_enqueue(svdev, scsi_tgt_num, VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED); + + SPDK_INFOLOG(SPDK_LOG_VHOST, "%s: queued 'Target %u' for hot-detach.\n", vdev->name, scsi_tgt_num); + return 0; +} + +int +spdk_vhost_scsi_controller_construct(void) +{ + struct spdk_conf_section *sp = spdk_conf_first_section(NULL); + struct spdk_vhost_dev *vdev; + int i, dev_num; + unsigned ctrlr_num = 0; + char *bdev_name, *tgt_num_str; + char *cpumask; + char *name; + char *tgt = NULL; + + while (sp != NULL) { + if (!spdk_conf_section_match_prefix(sp, "VhostScsi")) { + sp = spdk_conf_next_section(sp); + continue; + } + + if (sscanf(spdk_conf_section_get_name(sp), "VhostScsi%u", &ctrlr_num) != 1) { + SPDK_ERRLOG("Section '%s' has non-numeric suffix.\n", + spdk_conf_section_get_name(sp)); + return -1; + } + + name = spdk_conf_section_get_val(sp, "Name"); + cpumask = spdk_conf_section_get_val(sp, "Cpumask"); + + if (spdk_vhost_scsi_dev_construct(name, cpumask) < 0) { + return -1; + } + + vdev = spdk_vhost_dev_find(name); + assert(vdev); + + for (i = 0; ; i++) { + + tgt = spdk_conf_section_get_nval(sp, "Target", i); + if (tgt == NULL) { + break; + } + + tgt_num_str = spdk_conf_section_get_nmval(sp, "Target", i, 0); + if (tgt_num_str == NULL) { + SPDK_ERRLOG("%s: Invalid or missing target number\n", name); + return -1; + } + + dev_num = (int)strtol(tgt_num_str, NULL, 10); + bdev_name = spdk_conf_section_get_nmval(sp, "Target", i, 1); + if (bdev_name == NULL) { + SPDK_ERRLOG("%s: Invalid or missing bdev name for target %d\n", name, dev_num); + 
return -1; + } else if (spdk_conf_section_get_nmval(sp, "Target", i, 2)) { + SPDK_ERRLOG("%s: Only one LUN per vhost SCSI device supported\n", name); + return -1; + } + + if (spdk_vhost_scsi_dev_add_tgt(vdev, dev_num, bdev_name) < 0) { + return -1; + } + } + + sp = spdk_conf_next_section(sp); + } + + return 0; +} + +static void +free_task_pool(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_vhost_virtqueue *vq; + uint16_t i; + + for (i = 0; i < svdev->vdev.max_queues; i++) { + vq = &svdev->vdev.virtqueue[i]; + if (vq->tasks == NULL) { + continue; + } + + spdk_dma_free(vq->tasks); + vq->tasks = NULL; + } +} + +static int +alloc_task_pool(struct spdk_vhost_scsi_dev *svdev) +{ + struct spdk_vhost_virtqueue *vq; + struct spdk_vhost_scsi_task *task; + uint32_t task_cnt; + uint16_t i; + uint32_t j; + + for (i = 0; i < svdev->vdev.max_queues; i++) { + vq = &svdev->vdev.virtqueue[i]; + if (vq->vring.desc == NULL) { + continue; + } + + task_cnt = vq->vring.size; + if (task_cnt > SPDK_VHOST_MAX_VQ_SIZE) { + /* sanity check */ + SPDK_ERRLOG("Controller %s: virtuque %"PRIu16" is too big. (size = %"PRIu32", max = %"PRIu32")\n", + svdev->vdev.name, i, task_cnt, SPDK_VHOST_MAX_VQ_SIZE); + free_task_pool(svdev); + return -1; + } + vq->tasks = spdk_dma_zmalloc(sizeof(struct spdk_vhost_scsi_task) * task_cnt, + SPDK_CACHE_LINE_SIZE, NULL); + if (vq->tasks == NULL) { + SPDK_ERRLOG("Controller %s: failed to allocate %"PRIu32" tasks for virtqueue %"PRIu16"\n", + svdev->vdev.name, task_cnt, i); + free_task_pool(svdev); + return -1; + } + + for (j = 0; j < task_cnt; j++) { + task = &((struct spdk_vhost_scsi_task *)vq->tasks)[j]; + task->svdev = svdev; + task->vq = vq; + task->req_idx = j; + } + } + + return 0; +} + +/* + * A new device is added to a data core. First the device is added to the main linked list + * and then allocated to a specific data core. 
+ */ +static int +spdk_vhost_scsi_start(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_scsi_dev *svdev; + uint32_t i; + int rc; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + SPDK_ERRLOG("Trying to start non-scsi controller as a scsi one.\n"); + rc = -1; + goto out; + } + + /* validate all I/O queues are in a contiguous index range */ + for (i = VIRTIO_SCSI_REQUESTQ; i < vdev->max_queues; i++) { + if (vdev->virtqueue[i].vring.desc == NULL) { + SPDK_ERRLOG("%s: queue %"PRIu32" is empty\n", vdev->name, i); + rc = -1; + goto out; + } + } + + rc = alloc_task_pool(svdev); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to alloc task pool.\n", vdev->name); + goto out; + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (svdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_allocate_io_channels(svdev->scsi_dev[i]); + } + SPDK_INFOLOG(SPDK_LOG_VHOST, "Started poller for vhost controller %s on lcore %d\n", + vdev->name, vdev->lcore); + + svdev->requestq_poller = spdk_poller_register(vdev_worker, svdev, 0); + if (vdev->virtqueue[VIRTIO_SCSI_CONTROLQ].vring.desc && + vdev->virtqueue[VIRTIO_SCSI_EVENTQ].vring.desc) { + svdev->mgmt_poller = spdk_poller_register(vdev_mgmt_worker, svdev, + MGMT_POLL_PERIOD_US); + } +out: + spdk_vhost_dev_backend_event_done(event_ctx, rc); + return rc; +} + +static int +destroy_device_poller_cb(void *arg) +{ + struct spdk_vhost_scsi_dev *svdev = arg; + uint32_t i; + + if (svdev->vdev.task_cnt > 0) { + return -1; + } + + + for (i = 0; i < svdev->vdev.max_queues; i++) { + spdk_vhost_vq_used_signal(&svdev->vdev, &svdev->vdev.virtqueue[i]); + } + + for (i = 0; i < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; i++) { + if (svdev->scsi_dev[i] == NULL) { + continue; + } + spdk_scsi_dev_free_io_channels(svdev->scsi_dev[i]); + } + + SPDK_INFOLOG(SPDK_LOG_VHOST, "Stopping poller for vhost controller %s\n", svdev->vdev.name); + + free_task_pool(svdev); + + spdk_poller_unregister(&svdev->destroy_ctx.poller); + spdk_vhost_dev_backend_event_done(svdev->destroy_ctx.event_ctx, 0); + + return -1; +} + +static int +spdk_vhost_scsi_stop(struct spdk_vhost_dev *vdev, void *event_ctx) +{ + struct spdk_vhost_scsi_dev *svdev; + + svdev = to_scsi_dev(vdev); + if (svdev == NULL) { + SPDK_ERRLOG("Trying to stop non-scsi controller as a scsi one.\n"); + goto err; + } + + svdev->destroy_ctx.event_ctx = event_ctx; + spdk_poller_unregister(&svdev->requestq_poller); + spdk_poller_unregister(&svdev->mgmt_poller); + svdev->destroy_ctx.poller = spdk_poller_register(destroy_device_poller_cb, svdev, + 1000); + + return 0; + +err: + spdk_vhost_dev_backend_event_done(event_ctx, -1); + return -1; +} + +static void +spdk_vhost_scsi_dump_info_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_scsi_dev *sdev; + struct spdk_scsi_lun *lun; + uint32_t dev_idx; + uint32_t lun_idx; + + assert(vdev != NULL); + spdk_json_write_name(w, "scsi"); + spdk_json_write_array_begin(w); + for (dev_idx = 0; dev_idx < SPDK_VHOST_SCSI_CTRLR_MAX_DEVS; dev_idx++) { + sdev = spdk_vhost_scsi_dev_get_tgt(vdev, dev_idx); + if (!sdev) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "scsi_dev_num"); + spdk_json_write_uint32(w, dev_idx); + + spdk_json_write_name(w, "id"); + spdk_json_write_int32(w, spdk_scsi_dev_get_id(sdev)); + + spdk_json_write_name(w, "target_name"); + spdk_json_write_string(w, spdk_scsi_dev_get_name(sdev)); + + spdk_json_write_name(w, "luns"); + spdk_json_write_array_begin(w); + + for (lun_idx = 0; lun_idx < 
SPDK_SCSI_DEV_MAX_LUN; lun_idx++) { + lun = spdk_scsi_dev_get_lun(sdev, lun_idx); + if (!lun) { + continue; + } + + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "id"); + spdk_json_write_int32(w, spdk_scsi_lun_get_id(lun)); + + spdk_json_write_name(w, "bdev_name"); + spdk_json_write_string(w, spdk_scsi_lun_get_bdev_name(lun)); + + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); + spdk_json_write_object_end(w); + } + + spdk_json_write_array_end(w); +} + +static void +spdk_vhost_scsi_write_config_json(struct spdk_vhost_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct spdk_vhost_scsi_dev *svdev; + struct spdk_scsi_lun *lun; + uint32_t i; + + svdev = to_scsi_dev(vdev); + if (!svdev) { + return; + } + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "construct_vhost_scsi_controller"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_string(w, "cpumask", spdk_cpuset_fmt(vdev->cpumask)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + for (i = 0; i < SPDK_COUNTOF(svdev->scsi_dev); i++) { + if (svdev->scsi_dev[i] == NULL || svdev->scsi_dev_state[i].removed) { + continue; + } + + lun = spdk_scsi_dev_get_lun(svdev->scsi_dev[i], 0); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "add_vhost_scsi_lun"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "ctrlr", vdev->name); + spdk_json_write_named_uint32(w, "scsi_target_num", i); + + spdk_json_write_named_string(w, "bdev_name", spdk_scsi_lun_get_bdev_name(lun)); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } +} + +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi", SPDK_LOG_VHOST_SCSI) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_queue", SPDK_LOG_VHOST_SCSI_QUEUE) +SPDK_LOG_REGISTER_COMPONENT("vhost_scsi_data", SPDK_LOG_VHOST_SCSI_DATA) diff --git a/src/spdk/lib/virtio/Makefile b/src/spdk/lib/virtio/Makefile new file mode 100644 index 00000000..db61c1f2 --- /dev/null +++ b/src/spdk/lib/virtio/Makefile @@ -0,0 +1,42 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +CFLAGS += $(ENV_CFLAGS) +C_SRCS = virtio.c virtio_user.c virtio_pci.c +C_SRCS += virtio_user/vhost_user.c +LIBNAME = virtio + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/virtio/virtio.c b/src/spdk/lib/virtio/virtio.c new file mode 100644 index 00000000..b03034cf --- /dev/null +++ b/src/spdk/lib/virtio/virtio.c @@ -0,0 +1,738 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "spdk/env.h" +#include "spdk/barrier.h" + +#include "spdk_internal/virtio.h" + +/* We use SMP memory barrier variants as all virtio_pci devices + * are purely virtual. All MMIO is executed on a CPU core, so + * there's no need to do full MMIO synchronization. 
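+ * (SMP barriers only order accesses as seen by other CPU cores, which is sufficient
+ * here and cheaper than full barriers that would also order real I/O accesses.)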
+ */ +#define virtio_mb() spdk_smp_mb() +#define virtio_rmb() spdk_smp_rmb() +#define virtio_wmb() spdk_smp_wmb() + +/* Chain all the descriptors in the ring with an END */ +static inline void +vring_desc_init(struct vring_desc *dp, uint16_t n) +{ + uint16_t i; + + for (i = 0; i < n - 1; i++) { + dp[i].next = (uint16_t)(i + 1); + } + dp[i].next = VQ_RING_DESC_CHAIN_END; +} + +static void +virtio_init_vring(struct virtqueue *vq) +{ + int size = vq->vq_nentries; + struct vring *vr = &vq->vq_ring; + uint8_t *ring_mem = vq->vq_ring_virt_mem; + + /* + * Reinitialise since virtio port might have been stopped and restarted + */ + memset(ring_mem, 0, vq->vq_ring_size); + vring_init(vr, size, ring_mem, VIRTIO_PCI_VRING_ALIGN); + vq->vq_used_cons_idx = 0; + vq->vq_desc_head_idx = 0; + vq->vq_avail_idx = 0; + vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); + vq->vq_free_cnt = vq->vq_nentries; + vq->req_start = VQ_RING_DESC_CHAIN_END; + vq->req_end = VQ_RING_DESC_CHAIN_END; + vq->reqs_finished = 0; + memset(vq->vq_descx, 0, sizeof(struct vq_desc_extra) * vq->vq_nentries); + + vring_desc_init(vr->desc, size); + + /* Tell the backend not to interrupt us. + * If F_EVENT_IDX is negotiated, we will always set incredibly high + * used event idx, so that we will practically never receive an + * interrupt. See virtqueue_req_flush() + */ + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + vring_used_event(&vq->vq_ring) = UINT16_MAX; + } else { + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + } +} + +static int +virtio_init_queue(struct virtio_dev *dev, uint16_t vtpci_queue_idx) +{ + unsigned int vq_size, size; + struct virtqueue *vq; + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "setting up queue: %"PRIu16"\n", vtpci_queue_idx); + + /* + * Read the virtqueue size from the Queue Size field + * Always power of 2 and if 0 virtqueue does not exist + */ + vq_size = virtio_dev_backend_ops(dev)->get_queue_size(dev, vtpci_queue_idx); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq_size: %u\n", vq_size); + if (vq_size == 0) { + SPDK_ERRLOG("virtqueue %"PRIu16" does not exist\n", vtpci_queue_idx); + return -EINVAL; + } + + if (!rte_is_power_of_2(vq_size)) { + SPDK_ERRLOG("virtqueue %"PRIu16" size (%u) is not powerof 2\n", + vtpci_queue_idx, vq_size); + return -EINVAL; + } + + size = RTE_ALIGN_CEIL(sizeof(*vq) + + vq_size * sizeof(struct vq_desc_extra), + RTE_CACHE_LINE_SIZE); + + vq = spdk_dma_zmalloc(size, RTE_CACHE_LINE_SIZE, NULL); + if (vq == NULL) { + SPDK_ERRLOG("can not allocate vq\n"); + return -ENOMEM; + } + dev->vqs[vtpci_queue_idx] = vq; + + vq->vdev = dev; + vq->vq_queue_index = vtpci_queue_idx; + vq->vq_nentries = vq_size; + + /* + * Reserve a memzone for vring elements + */ + size = vring_size(vq_size, VIRTIO_PCI_VRING_ALIGN); + vq->vq_ring_size = RTE_ALIGN_CEIL(size, VIRTIO_PCI_VRING_ALIGN); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vring_size: %u, rounded_vring_size: %u\n", + size, vq->vq_ring_size); + + vq->owner_thread = NULL; + + rc = virtio_dev_backend_ops(dev)->setup_queue(dev, vq); + if (rc < 0) { + SPDK_ERRLOG("setup_queue failed\n"); + spdk_dma_free(vq); + dev->vqs[vtpci_queue_idx] = NULL; + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_mem: 0x%" PRIx64 "\n", + vq->vq_ring_mem); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "vq->vq_ring_virt_mem: 0x%" PRIx64 "\n", + (uint64_t)(uintptr_t)vq->vq_ring_virt_mem); + + virtio_init_vring(vq); + return 0; +} + +static void +virtio_free_queues(struct virtio_dev *dev) +{ + uint16_t nr_vq = dev->max_queues; + 
struct virtqueue *vq; + uint16_t i; + + if (dev->vqs == NULL) { + return; + } + + for (i = 0; i < nr_vq; i++) { + vq = dev->vqs[i]; + if (!vq) { + continue; + } + + virtio_dev_backend_ops(dev)->del_queue(dev, vq); + + rte_free(vq); + dev->vqs[i] = NULL; + } + + rte_free(dev->vqs); + dev->vqs = NULL; +} + +static int +virtio_alloc_queues(struct virtio_dev *dev, uint16_t request_vq_num, uint16_t fixed_vq_num) +{ + uint16_t nr_vq; + uint16_t i; + int ret; + + nr_vq = request_vq_num + fixed_vq_num; + if (nr_vq == 0) { + /* perfectly fine to have a device with no virtqueues. */ + return 0; + } + + assert(dev->vqs == NULL); + dev->vqs = rte_zmalloc(NULL, sizeof(struct virtqueue *) * nr_vq, 0); + if (!dev->vqs) { + SPDK_ERRLOG("failed to allocate %"PRIu16" vqs\n", nr_vq); + return -ENOMEM; + } + + for (i = 0; i < nr_vq; i++) { + ret = virtio_init_queue(dev, i); + if (ret < 0) { + virtio_free_queues(dev); + return ret; + } + } + + dev->max_queues = nr_vq; + dev->fixed_queues_num = fixed_vq_num; + return 0; +} + +/** + * Negotiate virtio features. For virtio_user this will also set + * dev->modern flag if VIRTIO_F_VERSION_1 flag is negotiated. + */ +static int +virtio_negotiate_features(struct virtio_dev *dev, uint64_t req_features) +{ + uint64_t host_features = virtio_dev_backend_ops(dev)->get_features(dev); + int rc; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "guest features = %" PRIx64 "\n", req_features); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "device features = %" PRIx64 "\n", host_features); + + rc = virtio_dev_backend_ops(dev)->set_features(dev, req_features & host_features); + if (rc != 0) { + SPDK_ERRLOG("failed to negotiate device features.\n"); + return rc; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "negotiated features = %" PRIx64 "\n", + dev->negotiated_features); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_FEATURES_OK); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_FEATURES_OK)) { + SPDK_ERRLOG("failed to set FEATURES_OK status!\n"); + /* either the device failed, or we offered some features that + * depend on other, not offered features. 
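+ * Per the virtio spec the device must not be used when FEATURES_OK is rejected,
+ * so fail the negotiation here.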
+ */ + return -EINVAL; + } + + return 0; +} + +int +virtio_dev_construct(struct virtio_dev *vdev, const char *name, + const struct virtio_dev_ops *ops, void *ctx) +{ + int rc; + + vdev->name = strdup(name); + if (vdev->name == NULL) { + return -ENOMEM; + } + + rc = pthread_mutex_init(&vdev->mutex, NULL); + if (rc != 0) { + free(vdev->name); + return -rc; + } + + vdev->backend_ops = ops; + vdev->ctx = ctx; + + return 0; +} + +int +virtio_dev_reset(struct virtio_dev *dev, uint64_t req_features) +{ + req_features |= (1ULL << VIRTIO_F_VERSION_1); + + virtio_dev_stop(dev); + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_ACKNOWLEDGE)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_ACKNOWLEDGE status.\n"); + return -EIO; + } + + virtio_dev_set_status(dev, VIRTIO_CONFIG_S_DRIVER); + if (!(virtio_dev_get_status(dev) & VIRTIO_CONFIG_S_DRIVER)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER status.\n"); + return -EIO; + } + + return virtio_negotiate_features(dev, req_features); +} + +int +virtio_dev_start(struct virtio_dev *vdev, uint16_t max_queues, uint16_t fixed_queue_num) +{ + int ret; + + ret = virtio_alloc_queues(vdev, max_queues, fixed_queue_num); + if (ret < 0) { + return ret; + } + + virtio_dev_set_status(vdev, VIRTIO_CONFIG_S_DRIVER_OK); + if (!(virtio_dev_get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) { + SPDK_ERRLOG("Failed to set VIRTIO_CONFIG_S_DRIVER_OK status.\n"); + return -1; + } + + return 0; +} + +void +virtio_dev_destruct(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->destruct_dev(dev); + pthread_mutex_destroy(&dev->mutex); + free(dev->name); +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp, *dp_tail; + struct vq_desc_extra *dxp; + uint16_t desc_idx_last = desc_idx; + + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt + dxp->ndescs); + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + desc_idx_last = dp->next; + dp = &vq->vq_ring.desc[dp->next]; + } + } + dxp->ndescs = 0; + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). 
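+ * In that case the freed chain simply becomes the new free list; otherwise it is
+ * linked after the current tail descriptor.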
+ */ + if (vq->vq_desc_tail_idx == VQ_RING_DESC_CHAIN_END) { + vq->vq_desc_head_idx = desc_idx; + } else { + dp_tail = &vq->vq_ring.desc[vq->vq_desc_tail_idx]; + dp_tail->next = desc_idx; + } + + vq->vq_desc_tail_idx = desc_idx_last; + dp->next = VQ_RING_DESC_CHAIN_END; +} + +static uint16_t +virtqueue_dequeue_burst_rx(struct virtqueue *vq, void **rx_pkts, + uint32_t *len, uint16_t num) +{ + struct vring_used_elem *uep; + struct virtio_req *cookie; + uint16_t used_idx, desc_idx; + uint16_t i; + + /* Caller does the check */ + for (i = 0; i < num ; i++) { + used_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1)); + uep = &vq->vq_ring.used->ring[used_idx]; + desc_idx = (uint16_t) uep->id; + len[i] = uep->len; + cookie = (struct virtio_req *)vq->vq_descx[desc_idx].cookie; + + if (spdk_unlikely(cookie == NULL)) { + SPDK_WARNLOG("vring descriptor with no mbuf cookie at %"PRIu16"\n", + vq->vq_used_cons_idx); + break; + } + + rte_prefetch0(cookie); + rx_pkts[i] = cookie; + vq->vq_used_cons_idx++; + vq_ring_free_chain(vq, desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + } + + return i; +} + +static void +finish_req(struct virtqueue *vq) +{ + struct vring_desc *desc; + uint16_t avail_idx; + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_req_flush() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1)); + vq->vq_ring.avail->ring[avail_idx] = vq->req_start; + vq->vq_avail_idx++; + vq->req_end = VQ_RING_DESC_CHAIN_END; + virtio_wmb(); + vq->vq_ring.avail->idx = vq->vq_avail_idx; + vq->reqs_finished++; +} + +int +virtqueue_req_start(struct virtqueue *vq, void *cookie, int iovcnt) +{ + struct vq_desc_extra *dxp; + + if (iovcnt > vq->vq_free_cnt) { + return iovcnt > vq->vq_nentries ? -EINVAL : -ENOMEM; + } + + if (vq->req_end != VQ_RING_DESC_CHAIN_END) { + finish_req(vq); + } + + vq->req_start = vq->vq_desc_head_idx; + dxp = &vq->vq_descx[vq->req_start]; + dxp->cookie = cookie; + dxp->ndescs = 0; + + return 0; +} + +void +virtqueue_req_flush(struct virtqueue *vq) +{ + uint16_t reqs_finished; + + if (vq->req_end == VQ_RING_DESC_CHAIN_END) { + /* no non-empty requests have been started */ + return; + } + + finish_req(vq); + virtio_mb(); + + reqs_finished = vq->reqs_finished; + vq->reqs_finished = 0; + + if (vq->vdev->negotiated_features & (1ULL << VIRTIO_RING_F_EVENT_IDX)) { + /* Set used event idx to a value the device will never reach. + * This effectively disables interrupts. 
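+ * The device's avail event index (vring_avail_event) is then consulted below to
+ * decide whether a doorbell notification is still needed for the descriptors
+ * flushed by this call.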
+ */ + vring_used_event(&vq->vq_ring) = vq->vq_used_cons_idx - vq->vq_nentries - 1; + + if (!vring_need_event(vring_avail_event(&vq->vq_ring), + vq->vq_avail_idx, + vq->vq_avail_idx - reqs_finished)) { + return; + } + } else if (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) { + return; + } + + virtio_dev_backend_ops(vq->vdev)->notify_queue(vq->vdev, vq); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_DEV, "Notified backend after xmit\n"); +} + +void +virtqueue_req_abort(struct virtqueue *vq) +{ + struct vring_desc *desc; + + if (vq->req_start == VQ_RING_DESC_CHAIN_END) { + /* no requests have been started */ + return; + } + + desc = &vq->vq_ring.desc[vq->req_end]; + desc->flags &= ~VRING_DESC_F_NEXT; + + vq_ring_free_chain(vq, vq->req_start); + vq->req_start = VQ_RING_DESC_CHAIN_END; +} + +void +virtqueue_req_add_iovs(struct virtqueue *vq, struct iovec *iovs, uint16_t iovcnt, + enum spdk_virtio_desc_type desc_type) +{ + struct vring_desc *desc; + struct vq_desc_extra *dxp; + uint16_t i, prev_head, new_head; + + assert(vq->req_start != VQ_RING_DESC_CHAIN_END); + assert(iovcnt <= vq->vq_free_cnt); + + /* TODO use indirect descriptors if iovcnt is high enough + * or the caller specifies SPDK_VIRTIO_DESC_F_INDIRECT + */ + + prev_head = vq->req_end; + new_head = vq->vq_desc_head_idx; + for (i = 0; i < iovcnt; ++i) { + desc = &vq->vq_ring.desc[new_head]; + + if (!vq->vdev->is_hw) { + desc->addr = (uintptr_t)iovs[i].iov_base; + } else { + desc->addr = spdk_vtophys(iovs[i].iov_base); + } + + desc->len = iovs[i].iov_len; + /* always set NEXT flag. unset it on the last descriptor + * in the request-ending function. + */ + desc->flags = desc_type | VRING_DESC_F_NEXT; + + prev_head = new_head; + new_head = desc->next; + } + + dxp = &vq->vq_descx[vq->req_start]; + dxp->ndescs += iovcnt; + + vq->req_end = prev_head; + vq->vq_desc_head_idx = new_head; + vq->vq_free_cnt = (uint16_t)(vq->vq_free_cnt - iovcnt); + if (vq->vq_desc_head_idx == VQ_RING_DESC_CHAIN_END) { + assert(vq->vq_free_cnt == 0); + vq->vq_desc_tail_idx = VQ_RING_DESC_CHAIN_END; + } +} + +#define DESC_PER_CACHELINE (RTE_CACHE_LINE_SIZE / sizeof(struct vring_desc)) +uint16_t +virtio_recv_pkts(struct virtqueue *vq, void **io, uint32_t *len, uint16_t nb_pkts) +{ + uint16_t nb_used, num; + + nb_used = vq->vq_ring.used->idx - vq->vq_used_cons_idx; + virtio_rmb(); + + num = (uint16_t)(spdk_likely(nb_used <= nb_pkts) ? 
nb_used : nb_pkts); + if (spdk_likely(num > DESC_PER_CACHELINE)) { + num = num - ((vq->vq_used_cons_idx + num) % DESC_PER_CACHELINE); + } + + return virtqueue_dequeue_burst_rx(vq, io, len, num); +} + +int +virtio_dev_acquire_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("requested vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return -1; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL || vq->owner_thread != NULL) { + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return 0; +} + +int32_t +virtio_dev_find_and_acquire_queue(struct virtio_dev *vdev, uint16_t start_index) +{ + struct virtqueue *vq = NULL; + uint16_t i; + + pthread_mutex_lock(&vdev->mutex); + for (i = start_index; i < vdev->max_queues; ++i) { + vq = vdev->vqs[i]; + if (vq != NULL && vq->owner_thread == NULL) { + break; + } + } + + if (vq == NULL || i == vdev->max_queues) { + SPDK_ERRLOG("no more unused virtio queues with idx >= %"PRIu16".\n", start_index); + pthread_mutex_unlock(&vdev->mutex); + return -1; + } + + vq->owner_thread = spdk_get_thread(); + pthread_mutex_unlock(&vdev->mutex); + return i; +} + +struct spdk_thread * +virtio_dev_queue_get_thread(struct virtio_dev *vdev, uint16_t index) +{ + struct spdk_thread *thread = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16"\n", + index, vdev->max_queues); + abort(); /* This is not recoverable */ + } + + pthread_mutex_lock(&vdev->mutex); + thread = vdev->vqs[index]->owner_thread; + pthread_mutex_unlock(&vdev->mutex); + + return thread; +} + +bool +virtio_dev_queue_is_acquired(struct virtio_dev *vdev, uint16_t index) +{ + return virtio_dev_queue_get_thread(vdev, index) != NULL; +} + +void +virtio_dev_release_queue(struct virtio_dev *vdev, uint16_t index) +{ + struct virtqueue *vq = NULL; + + if (index >= vdev->max_queues) { + SPDK_ERRLOG("given vq index %"PRIu16" exceeds max queue count %"PRIu16".\n", + index, vdev->max_queues); + return; + } + + pthread_mutex_lock(&vdev->mutex); + vq = vdev->vqs[index]; + if (vq == NULL) { + SPDK_ERRLOG("virtqueue at index %"PRIu16" is not initialized.\n", index); + pthread_mutex_unlock(&vdev->mutex); + return; + } + + assert(vq->owner_thread == spdk_get_thread()); + vq->owner_thread = NULL; + pthread_mutex_unlock(&vdev->mutex); +} + +int +virtio_dev_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + return virtio_dev_backend_ops(dev)->read_dev_cfg(dev, offset, dst, length); +} + +int +virtio_dev_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + return virtio_dev_backend_ops(dev)->write_dev_cfg(dev, offset, src, length); +} + +void +virtio_dev_stop(struct virtio_dev *dev) +{ + virtio_dev_backend_ops(dev)->set_status(dev, VIRTIO_CONFIG_S_RESET); + /* flush status write */ + virtio_dev_backend_ops(dev)->get_status(dev); + virtio_free_queues(dev); +} + +void +virtio_dev_set_status(struct virtio_dev *dev, uint8_t status) +{ + if (status != VIRTIO_CONFIG_S_RESET) { + status |= virtio_dev_backend_ops(dev)->get_status(dev); + } + + virtio_dev_backend_ops(dev)->set_status(dev, status); +} + +uint8_t +virtio_dev_get_status(struct virtio_dev *dev) +{ + return virtio_dev_backend_ops(dev)->get_status(dev); +} + +const struct virtio_dev_ops * 
+virtio_dev_backend_ops(struct virtio_dev *dev) +{ + return dev->backend_ops; +} + +void +virtio_dev_dump_json_info(struct virtio_dev *hw, struct spdk_json_write_ctx *w) +{ + spdk_json_write_name(w, "virtio"); + spdk_json_write_object_begin(w); + + spdk_json_write_name(w, "vq_count"); + spdk_json_write_uint32(w, hw->max_queues); + + spdk_json_write_name(w, "vq_size"); + spdk_json_write_uint32(w, virtio_dev_backend_ops(hw)->get_queue_size(hw, 0)); + + virtio_dev_backend_ops(hw)->dump_json_info(hw, w); + + spdk_json_write_object_end(w); +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_dev", SPDK_LOG_VIRTIO_DEV) diff --git a/src/spdk/lib/virtio/virtio_pci.c b/src/spdk/lib/virtio/virtio_pci.c new file mode 100644 index 00000000..c21492a7 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_pci.c @@ -0,0 +1,590 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/mmio.h" +#include "spdk/string.h" +#include "spdk/env.h" + +#include "spdk_internal/virtio.h" + +struct virtio_hw { + uint8_t use_msix; + uint32_t notify_off_multiplier; + uint8_t *isr; + uint16_t *notify_base; + + struct { + /** Mem-mapped resources from given PCI BAR */ + void *vaddr; + + /** Length of the address space */ + uint32_t len; + } pci_bar[6]; + + struct virtio_pci_common_cfg *common_cfg; + struct spdk_pci_device *pci_dev; + + /** Device-specific PCI config space */ + void *dev_cfg; +}; + +struct virtio_pci_probe_ctx { + virtio_pci_create_cb enum_cb; + void *enum_ctx; + uint16_t device_id; +}; + +/* + * Following macros are derived from linux/pci_regs.h, however, + * we can't simply include that header here, as there is no such + * file for non-Linux platform. 
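+ * Only the config-space offsets needed to walk the PCI capability list are
+ * duplicated here.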
+ */ +#define PCI_CAPABILITY_LIST 0x34 +#define PCI_CAP_ID_VNDR 0x09 +#define PCI_CAP_ID_MSIX 0x11 + +static inline int +check_vq_phys_addr_ok(struct virtqueue *vq) +{ + /* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit, + * and only accepts 32 bit page frame number. + * Check if the allocated physical memory exceeds 16TB. + */ + if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >> + (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) { + SPDK_ERRLOG("vring address shouldn't be above 16TB!\n"); + return 0; + } + + return 1; +} + +static void +free_virtio_hw(struct virtio_hw *hw) +{ + unsigned i; + + for (i = 0; i < 6; ++i) { + if (hw->pci_bar[i].vaddr == NULL) { + continue; + } + + spdk_pci_device_unmap_bar(hw->pci_dev, i, hw->pci_bar[i].vaddr); + } + + free(hw); +} + +static void +pci_dump_json_info(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr((struct spdk_pci_device *)hw->pci_dev); + char addr[32]; + + spdk_json_write_name(w, "type"); + if (dev->modern) { + spdk_json_write_string(w, "pci-modern"); + } else { + spdk_json_write_string(w, "pci-legacy"); + } + + spdk_json_write_name(w, "pci_address"); + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + spdk_json_write_string(w, addr); +} + +static void +pci_write_json_config(struct virtio_dev *dev, struct spdk_json_write_ctx *w) +{ + struct virtio_hw *hw = dev->ctx; + struct spdk_pci_addr pci_addr = spdk_pci_device_get_addr(hw->pci_dev); + char addr[32]; + + spdk_pci_addr_fmt(addr, sizeof(addr), &pci_addr); + + spdk_json_write_named_string(w, "trtype", "pci"); + spdk_json_write_named_string(w, "traddr", addr); +} + +static inline void +io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi) +{ + spdk_mmio_write_4(lo, val & ((1ULL << 32) - 1)); + spdk_mmio_write_4(hi, val >> 32); +} + +static int +modern_read_dev_config(struct virtio_dev *dev, size_t offset, + void *dst, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + uint8_t *p; + uint8_t old_gen, new_gen; + + do { + old_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + + p = dst; + for (i = 0; i < length; i++) { + *p++ = spdk_mmio_read_1((uint8_t *)hw->dev_cfg + offset + i); + } + + new_gen = spdk_mmio_read_1(&hw->common_cfg->config_generation); + } while (old_gen != new_gen); + + return 0; +} + +static int +modern_write_dev_config(struct virtio_dev *dev, size_t offset, + const void *src, int length) +{ + struct virtio_hw *hw = dev->ctx; + int i; + const uint8_t *p = src; + + for (i = 0; i < length; i++) { + spdk_mmio_write_1(((uint8_t *)hw->dev_cfg) + offset + i, *p++); + } + + return 0; +} + +static uint64_t +modern_get_features(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + uint32_t features_lo, features_hi; + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 0); + features_lo = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + spdk_mmio_write_4(&hw->common_cfg->device_feature_select, 1); + features_hi = spdk_mmio_read_4(&hw->common_cfg->device_feature); + + return ((uint64_t)features_hi << 32) | features_lo; +} + +static int +modern_set_features(struct virtio_dev *dev, uint64_t features) +{ + struct virtio_hw *hw = dev->ctx; + + if ((features & (1ULL << VIRTIO_F_VERSION_1)) == 0) { + SPDK_ERRLOG("VIRTIO_F_VERSION_1 feature is not enabled.\n"); + return -EINVAL; + } + + spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 0); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features & ((1ULL << 32) - 1)); + + 
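+ /* The 64-bit feature bitmap is exposed through a 32-bit select/value window;
+  * selecting 1 gives access to the upper half of the bitmap. */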
spdk_mmio_write_4(&hw->common_cfg->guest_feature_select, 1); + spdk_mmio_write_4(&hw->common_cfg->guest_feature, features >> 32); + + dev->negotiated_features = features; + + return 0; +} + +static void +modern_destruct_dev(struct virtio_dev *vdev) +{ + struct virtio_hw *hw = vdev->ctx; + struct spdk_pci_device *pci_dev = hw->pci_dev; + + free_virtio_hw(hw); + spdk_pci_device_detach(pci_dev); +} + +static uint8_t +modern_get_status(struct virtio_dev *dev) +{ + struct virtio_hw *hw = dev->ctx; + + return spdk_mmio_read_1(&hw->common_cfg->device_status); +} + +static void +modern_set_status(struct virtio_dev *dev, uint8_t status) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_1(&hw->common_cfg->device_status, status); +} + +static uint16_t +modern_get_queue_size(struct virtio_dev *dev, uint16_t queue_id) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, queue_id); + return spdk_mmio_read_2(&hw->common_cfg->queue_size); +} + +static int +modern_setup_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + uint64_t desc_addr, avail_addr, used_addr; + uint16_t notify_off; + void *queue_mem; + uint64_t queue_mem_phys_addr; + + /* To ensure physical address contiguity we make the queue occupy + * only a single hugepage (2MB). As of Virtio 1.0, the queue size + * always falls within this limit. + */ + if (vq->vq_ring_size > 0x200000) { + return -ENOMEM; + } + + queue_mem = spdk_dma_zmalloc(vq->vq_ring_size, 0x200000, &queue_mem_phys_addr); + if (queue_mem == NULL) { + return -ENOMEM; + } + + vq->vq_ring_mem = queue_mem_phys_addr; + vq->vq_ring_virt_mem = queue_mem; + + if (!check_vq_phys_addr_ok(vq)) { + spdk_dma_free(queue_mem); + return -ENOMEM; + } + + desc_addr = vq->vq_ring_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = (avail_addr + offsetof(struct vring_avail, ring[vq->vq_nentries]) + + VIRTIO_PCI_VRING_ALIGN - 1) & ~(VIRTIO_PCI_VRING_ALIGN - 1); + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + + notify_off = spdk_mmio_read_2(&hw->common_cfg->queue_notify_off); + vq->notify_addr = (void *)((uint8_t *)hw->notify_base + + notify_off * hw->notify_off_multiplier); + + spdk_mmio_write_2(&hw->common_cfg->queue_enable, 1); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "queue %"PRIu16" addresses:\n", vq->vq_queue_index); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t desc_addr: %" PRIx64 "\n", desc_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t aval_addr: %" PRIx64 "\n", avail_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t used_addr: %" PRIx64 "\n", used_addr); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "\t notify addr: %p (notify offset: %"PRIu16")\n", + vq->notify_addr, notify_off); + + return 0; +} + +static void +modern_del_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + struct virtio_hw *hw = dev->ctx; + + spdk_mmio_write_2(&hw->common_cfg->queue_select, vq->vq_queue_index); + + io_write64_twopart(0, &hw->common_cfg->queue_desc_lo, + &hw->common_cfg->queue_desc_hi); + io_write64_twopart(0, &hw->common_cfg->queue_avail_lo, + &hw->common_cfg->queue_avail_hi); + io_write64_twopart(0, &hw->common_cfg->queue_used_lo, + &hw->common_cfg->queue_used_hi); + 
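+ /* Disable the queue before freeing the DMA-able ring memory that was handed to the device. */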
+ spdk_mmio_write_2(&hw->common_cfg->queue_enable, 0); + + spdk_dma_free(vq->vq_ring_virt_mem); +} + +static void +modern_notify_queue(struct virtio_dev *dev, struct virtqueue *vq) +{ + spdk_mmio_write_2(vq->notify_addr, vq->vq_queue_index); +} + +static const struct virtio_dev_ops modern_ops = { + .read_dev_cfg = modern_read_dev_config, + .write_dev_cfg = modern_write_dev_config, + .get_status = modern_get_status, + .set_status = modern_set_status, + .get_features = modern_get_features, + .set_features = modern_set_features, + .destruct_dev = modern_destruct_dev, + .get_queue_size = modern_get_queue_size, + .setup_queue = modern_setup_queue, + .del_queue = modern_del_queue, + .notify_queue = modern_notify_queue, + .dump_json_info = pci_dump_json_info, + .write_json_config = pci_write_json_config, +}; + +static void * +get_cfg_addr(struct virtio_hw *hw, struct virtio_pci_cap *cap) +{ + uint8_t bar = cap->bar; + uint32_t length = cap->length; + uint32_t offset = cap->offset; + + if (bar > 5) { + SPDK_ERRLOG("invalid bar: %"PRIu8"\n", bar); + return NULL; + } + + if (offset + length < offset) { + SPDK_ERRLOG("offset(%"PRIu32") + length(%"PRIu32") overflows\n", + offset, length); + return NULL; + } + + if (offset + length > hw->pci_bar[bar].len) { + SPDK_ERRLOG("invalid cap: overflows bar space: %"PRIu32" > %"PRIu32"\n", + offset + length, hw->pci_bar[bar].len); + return NULL; + } + + if (hw->pci_bar[bar].vaddr == NULL) { + SPDK_ERRLOG("bar %"PRIu8" base addr is NULL\n", bar); + return NULL; + } + + return hw->pci_bar[bar].vaddr + offset; +} + +static int +virtio_read_caps(struct virtio_hw *hw) +{ + uint8_t pos; + struct virtio_pci_cap cap; + int ret; + + ret = spdk_pci_device_cfg_read(hw->pci_dev, &pos, 1, PCI_CAPABILITY_LIST); + if (ret < 0) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "failed to read pci capability list\n"); + return ret; + } + + while (pos) { + ret = spdk_pci_device_cfg_read(hw->pci_dev, &cap, sizeof(cap), pos); + if (ret < 0) { + SPDK_ERRLOG("failed to read pci cap at pos: %"PRIx8"\n", pos); + break; + } + + if (cap.cap_vndr == PCI_CAP_ID_MSIX) { + hw->use_msix = 1; + } + + if (cap.cap_vndr != PCI_CAP_ID_VNDR) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] skipping non VNDR cap id: %02"PRIx8"\n", + pos, cap.cap_vndr); + goto next; + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, + "[%2"PRIx8"] cfg type: %"PRIu8", bar: %"PRIu8", offset: %04"PRIx32", len: %"PRIu32"\n", + pos, cap.cfg_type, cap.bar, cap.offset, cap.length); + + switch (cap.cfg_type) { + case VIRTIO_PCI_CAP_COMMON_CFG: + hw->common_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_NOTIFY_CFG: + spdk_pci_device_cfg_read(hw->pci_dev, &hw->notify_off_multiplier, + 4, pos + sizeof(cap)); + hw->notify_base = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_DEVICE_CFG: + hw->dev_cfg = get_cfg_addr(hw, &cap); + break; + case VIRTIO_PCI_CAP_ISR_CFG: + hw->isr = get_cfg_addr(hw, &cap); + break; + } + +next: + pos = cap.cap_next; + } + + if (hw->common_cfg == NULL || hw->notify_base == NULL || + hw->dev_cfg == NULL || hw->isr == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "no modern virtio pci device found.\n"); + if (ret < 0) { + return ret; + } else { + return -EINVAL; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "found modern virtio pci device.\n"); + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "common cfg mapped at: %p\n", hw->common_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "device cfg mapped at: %p\n", hw->dev_cfg); + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "isr cfg mapped at: %p\n", hw->isr); + 
SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_PCI, "notify base: %p, notify off multiplier: %u\n", + hw->notify_base, hw->notify_off_multiplier); + + return 0; +} + +static int +virtio_pci_dev_probe(struct spdk_pci_device *pci_dev, struct virtio_pci_probe_ctx *ctx) +{ + struct virtio_hw *hw; + uint8_t *bar_vaddr; + uint64_t bar_paddr, bar_len; + int rc; + unsigned i; + char bdf[32]; + struct spdk_pci_addr addr; + + addr = spdk_pci_device_get_addr(pci_dev); + rc = spdk_pci_addr_fmt(bdf, sizeof(bdf), &addr); + if (rc != 0) { + SPDK_ERRLOG("Ignoring a device with non-parseable PCI address\n"); + return -1; + } + + hw = calloc(1, sizeof(*hw)); + if (hw == NULL) { + SPDK_ERRLOG("%s: calloc failed\n", bdf); + return -1; + } + + hw->pci_dev = pci_dev; + + for (i = 0; i < 6; ++i) { + rc = spdk_pci_device_map_bar(pci_dev, i, (void *) &bar_vaddr, &bar_paddr, + &bar_len); + if (rc != 0) { + SPDK_ERRLOG("%s: failed to memmap PCI BAR %u\n", bdf, i); + free_virtio_hw(hw); + return -1; + } + + hw->pci_bar[i].vaddr = bar_vaddr; + hw->pci_bar[i].len = bar_len; + } + + /* Virtio PCI caps exist only on modern PCI devices. + * Legacy devices are not supported. + */ + if (virtio_read_caps(hw) != 0) { + SPDK_NOTICELOG("Ignoring legacy PCI device at %s\n", bdf); + free_virtio_hw(hw); + return -1; + } + + rc = ctx->enum_cb((struct virtio_pci_ctx *)hw, ctx->enum_ctx); + if (rc != 0) { + free_virtio_hw(hw); + } + + return rc; +} + +static int +virtio_pci_dev_probe_cb(void *probe_ctx, struct spdk_pci_device *pci_dev) +{ + struct virtio_pci_probe_ctx *ctx = probe_ctx; + uint16_t pci_device_id = spdk_pci_device_get_device_id(pci_dev); + + if (pci_device_id != ctx->device_id) { + return 1; + } + + return virtio_pci_dev_probe(pci_dev, ctx); +} + +int +virtio_pci_dev_enumerate(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_virtio_enumerate(virtio_pci_dev_probe_cb, &ctx); +} + +int +virtio_pci_dev_attach(virtio_pci_create_cb enum_cb, void *enum_ctx, + uint16_t pci_device_id, struct spdk_pci_addr *pci_address) +{ + struct virtio_pci_probe_ctx ctx; + + if (!spdk_process_is_primary()) { + SPDK_WARNLOG("virtio_pci secondary process support is not implemented yet.\n"); + return 0; + } + + ctx.enum_cb = enum_cb; + ctx.enum_ctx = enum_ctx; + ctx.device_id = pci_device_id; + + return spdk_pci_virtio_device_attach(virtio_pci_dev_probe_cb, &ctx, pci_address); +} + +int +virtio_pci_dev_init(struct virtio_dev *vdev, const char *name, + struct virtio_pci_ctx *pci_ctx) +{ + int rc; + + rc = virtio_dev_construct(vdev, name, &modern_ops, pci_ctx); + if (rc != 0) { + return rc; + } + + vdev->is_hw = 1; + vdev->modern = 1; + + return 0; +} + +SPDK_LOG_REGISTER_COMPONENT("virtio_pci", SPDK_LOG_VIRTIO_PCI) diff --git a/src/spdk/lib/virtio/virtio_user.c b/src/spdk/lib/virtio/virtio_user.c new file mode 100644 index 00000000..5dadda61 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user.c @@ -0,0 +1,621 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <sys/eventfd.h> + +#include <linux/virtio_scsi.h> + +#include <linux/virtio_pci.h> +#include <linux/vhost.h> +#include <linux/virtio_ring.h> + +#include "virtio_user/vhost.h" +#include "spdk/string.h" + +#include "spdk_internal/virtio.h" + +#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \ + ((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \ + (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) + +static int +virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL comes + * first because vhost depends on this msg to allocate virtqueue + * pair. + */ + struct vhost_vring_file file; + + file.index = queue_sel; + file.fd = dev->callfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file); +} + +static int +virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_file file; + struct vhost_vring_state state; + struct vring *vring = &dev->vrings[queue_sel]; + struct vhost_vring_addr addr = { + .index = queue_sel, + .desc_user_addr = (uint64_t)(uintptr_t)vring->desc, + .avail_user_addr = (uint64_t)(uintptr_t)vring->avail, + .used_user_addr = (uint64_t)(uintptr_t)vring->used, + .log_guest_addr = 0, + .flags = 0, /* disable log */ + }; + int rc; + + state.index = queue_sel; + state.num = vring->num; + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state); + if (rc < 0) { + return rc; + } + + state.index = queue_sel; + state.num = 0; /* no reservation */ + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state); + if (rc < 0) { + return rc; + } + + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr); + if (rc < 0) { + return rc; + } + + /* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes + * last because vhost depends on this msg to judge if + * virtio is ready.
+ */ + file.index = queue_sel; + file.fd = dev->kickfds[queue_sel]; + return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file); +} + +static int +virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + + state.index = queue_sel; + state.num = 0; + + return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state); +} + +static int +virtio_user_queue_setup(struct virtio_dev *vdev, + int (*fn)(struct virtio_dev *, uint32_t)) +{ + uint32_t i; + int rc; + + for (i = 0; i < vdev->max_queues; ++i) { + rc = fn(vdev, i); + if (rc < 0) { + SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i); + return rc; + } + } + + return 0; +} + +static int +virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct virtio_dev *vdev = cb_ctx; + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int ret; + + /* We have to resend all mappings anyway, so don't bother with any + * page tracking. + */ + ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL); + if (ret < 0) { + return ret; + } + + /* We have to send SET_VRING_ADDR to make rte_vhost flush a pending + * SET_MEM_TABLE... + */ + ret = virtio_user_queue_setup(vdev, virtio_user_kick_queue); + if (ret < 0) { + return ret; + } + + /* Since we might want to use that mapping straight away, we have to + * make sure the guest has already processed our SET_MEM_TABLE message. + * F_REPLY_ACK is just a feature and the host is not obliged to + * support it, so we send a simple message that always has a response + * and we wait for that response. Messages are always processed in order. + */ + return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); +} + +static int +virtio_user_register_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + const struct spdk_mem_map_ops virtio_user_map_ops = { + .notify_cb = virtio_user_map_notify, + .are_contiguous = NULL + }; + + dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev); + if (dev->mem_map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + return -1; + } + + return 0; +} + +static void +virtio_user_unregister_mem(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_mem_map_free(&dev->mem_map); +} + +static int +virtio_user_start_device(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t host_max_queues; + int ret; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 && + vdev->max_queues > 1 + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the " + "host doesn't support VHOST_USER_PROTOCOL_F_MQ. " + "Only one request queue will be used.\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num); + vdev->max_queues = 1 + vdev->fixed_queues_num; + } + + /* negotiate the number of I/O queues. 
*/ + ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues); + if (ret < 0) { + return ret; + } + + if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) { + SPDK_WARNLOG("%s: requested %"PRIu16" request queues" + "but only %"PRIu64" available\n", + vdev->name, vdev->max_queues - vdev->fixed_queues_num, + host_max_queues); + vdev->max_queues = host_max_queues; + } + + /* tell vhost to create queues */ + ret = virtio_user_queue_setup(vdev, virtio_user_create_queue); + if (ret < 0) { + return ret; + } + + ret = virtio_user_register_mem(vdev); + if (ret < 0) { + return ret; + } + + return 0; +} + +static int +virtio_user_stop_device(struct virtio_dev *vdev) +{ + int ret; + + ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue); + /* a queue might fail to stop for various reasons, e.g. socket + * connection going down, but this mustn't prevent us from freeing + * the mem map. + */ + virtio_user_unregister_mem(vdev); + return ret; +} + +static int +virtio_user_dev_setup(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint16_t i; + + dev->vhostfd = -1; + + for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) { + dev->callfds[i] = -1; + dev->kickfds[i] = -1; + } + + dev->ops = &ops_user; + + return dev->ops->setup(dev); +} + +static int +virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset, + void *dst, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = 0; + cfg.size = VHOST_USER_MAX_CONFIG_SIZE; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + memcpy(dst, cfg.region + offset, length); + return 0; +} + +static int +virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset, + const void *src, int length) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_user_config cfg = {0}; + int rc; + + if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) { + return -ENOTSUP; + } + + cfg.offset = offset; + cfg.size = length; + memcpy(cfg.region, src, length); + + rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg); + if (rc < 0) { + SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + return 0; +} + +static void +virtio_user_set_status(struct virtio_dev *vdev, uint8_t status) +{ + struct virtio_user_dev *dev = vdev->ctx; + int rc = 0; + + if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) && + status != VIRTIO_CONFIG_S_RESET) { + rc = -1; + } else if (status & VIRTIO_CONFIG_S_DRIVER_OK) { + rc = virtio_user_start_device(vdev); + } else if (status == VIRTIO_CONFIG_S_RESET && + (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { + rc = virtio_user_stop_device(vdev); + } + + if (rc != 0) { + dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET; + } else { + dev->status = status; + } +} + +static uint8_t +virtio_user_get_status(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + return dev->status; +} + +static uint64_t +virtio_user_get_features(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t features; + int rc; + + rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features); + if (rc < 0) { + SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc)); + return 0; + } + + return features; +} + +static int 
+virtio_user_set_features(struct virtio_dev *vdev, uint64_t features) +{ + struct virtio_user_dev *dev = vdev->ctx; + uint64_t protocol_features; + int ret; + + ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features); + if (ret < 0) { + return ret; + } + + vdev->negotiated_features = features; + vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1); + + if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + /* nothing else to do */ + return 0; + } + + ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES; + ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features); + if (ret < 0) { + return ret; + } + + dev->protocol_features = protocol_features; + return 0; +} + +static uint16_t +virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id) +{ + struct virtio_user_dev *dev = vdev->ctx; + + /* Currently each queue has same queue size */ + return dev->queue_size; +} + +static int +virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + struct virtio_user_dev *dev = vdev->ctx; + struct vhost_vring_state state; + uint16_t queue_idx = vq->vq_queue_index; + void *queue_mem; + uint64_t desc_addr, avail_addr, used_addr; + int callfd, kickfd, rc; + + if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) { + SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx); + return -EEXIST; + } + + /* May use invalid flag, but some backend uses kickfd and + * callfd as criteria to judge if dev is alive. so finally we + * use real event_fd. + */ + callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (callfd < 0) { + SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno)); + return -errno; + } + + kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK); + if (kickfd < 0) { + SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno)); + close(callfd); + return -errno; + } + + queue_mem = spdk_dma_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL); + if (queue_mem == NULL) { + close(kickfd); + close(callfd); + return -ENOMEM; + } + + vq->vq_ring_mem = SPDK_VTOPHYS_ERROR; + vq->vq_ring_virt_mem = queue_mem; + + state.index = vq->vq_queue_index; + state.num = 0; + + if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) { + rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state); + if (rc < 0) { + SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n", + spdk_strerror(-rc)); + spdk_dma_free(queue_mem); + return -rc; + } + } + + dev->callfds[queue_idx] = callfd; + dev->kickfds[queue_idx] = kickfd; + + desc_addr = (uintptr_t)vq->vq_ring_virt_mem; + avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc); + used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail, + ring[vq->vq_nentries]), + VIRTIO_PCI_VRING_ALIGN); + + dev->vrings[queue_idx].num = vq->vq_nentries; + dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr; + dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr; + dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr; + + return 0; +} + +static void +virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + /* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU + * correspondingly stops the ioeventfds, and reset the status of + * the device. + * For modern devices, set queue desc, avail, used in PCI bar to 0, + * not see any more behavior in QEMU. 
+ * + * Here we just care about what information to deliver to vhost-user. + * So we just close ioeventfd for now. + */ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->callfds[vq->vq_queue_index]); + close(dev->kickfds[vq->vq_queue_index]); + dev->callfds[vq->vq_queue_index] = -1; + dev->kickfds[vq->vq_queue_index] = -1; + + spdk_dma_free(vq->vq_ring_virt_mem); +} + +static void +virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq) +{ + uint64_t buf = 1; + struct virtio_user_dev *dev = vdev->ctx; + + if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) { + SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno)); + } +} + +static void +virtio_user_destroy(struct virtio_dev *vdev) +{ + struct virtio_user_dev *dev = vdev->ctx; + + close(dev->vhostfd); + free(dev); +} + +static void +virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_name(w, "type"); + spdk_json_write_string(w, "user"); + + spdk_json_write_name(w, "socket"); + spdk_json_write_string(w, dev->path); +} + +static void +virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w) +{ + struct virtio_user_dev *dev = vdev->ctx; + + spdk_json_write_named_string(w, "trtype", "user"); + spdk_json_write_named_string(w, "traddr", dev->path); + spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num); + spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0)); +} + +static const struct virtio_dev_ops virtio_user_ops = { + .read_dev_cfg = virtio_user_read_dev_config, + .write_dev_cfg = virtio_user_write_dev_config, + .get_status = virtio_user_get_status, + .set_status = virtio_user_set_status, + .get_features = virtio_user_get_features, + .set_features = virtio_user_set_features, + .destruct_dev = virtio_user_destroy, + .get_queue_size = virtio_user_get_queue_size, + .setup_queue = virtio_user_setup_queue, + .del_queue = virtio_user_del_queue, + .notify_queue = virtio_user_notify_queue, + .dump_json_info = virtio_user_dump_json_info, + .write_json_config = virtio_user_write_json_config, +}; + +int +virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path, + uint32_t queue_size) +{ + struct virtio_user_dev *dev; + int rc; + + if (name == NULL) { + SPDK_ERRLOG("No name gived for controller: %s\n", path); + return -EINVAL; + } + + dev = calloc(1, sizeof(*dev)); + if (dev == NULL) { + return -ENOMEM; + } + + rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev); + if (rc != 0) { + SPDK_ERRLOG("Failed to init device: %s\n", path); + free(dev); + return rc; + } + + vdev->is_hw = 0; + + snprintf(dev->path, PATH_MAX, "%s", path); + dev->queue_size = queue_size; + + rc = virtio_user_dev_setup(vdev); + if (rc < 0) { + SPDK_ERRLOG("backend set up fails\n"); + goto err; + } + + rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL); + if (rc < 0) { + SPDK_ERRLOG("set_owner fails: %s\n", spdk_strerror(-rc)); + goto err; + } + + return 0; + +err: + virtio_dev_destruct(vdev); + return rc; +} diff --git a/src/spdk/lib/virtio/virtio_user/vhost.h b/src/spdk/lib/virtio/virtio_user/vhost.h new file mode 100644 index 00000000..0ac7c5b1 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user/vhost.h @@ -0,0 +1,113 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. 
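Editorial note on the kick/call mechanism used above: virtio_user_setup_queue() creates one eventfd pair per virtqueue and hands both descriptors to the backend over the vhost-user socket (VHOST_USER_SET_VRING_CALL / VHOST_USER_SET_VRING_KICK), and virtio_user_notify_queue() then signals new available descriptors simply by writing an 8-byte counter to the kick fd. The standalone sketch below is editorial (not part of the imported sources) and shows those eventfd semantics in isolation: writes accumulate in the kernel and a single read drains the counter.

#include <sys/eventfd.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val = 1;
	int kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);

	if (kickfd < 0) {
		perror("eventfd");
		return 1;
	}

	/* Driver side: one 8-byte write per kick (cf. virtio_user_notify_queue). */
	(void)write(kickfd, &val, sizeof(val));
	(void)write(kickfd, &val, sizeof(val));

	/* Backend side: a single read returns the number of accumulated kicks
	 * (here 2) and resets the counter; with EFD_NONBLOCK it would fail with
	 * EAGAIN if nothing were pending. */
	if (read(kickfd, &val, sizeof(val)) == (ssize_t)sizeof(val)) {
		printf("pending kicks: %" PRIu64 "\n", val);
	}

	close(kickfd);
	return 0;
}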
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHOST_NET_USER_H +#define _VHOST_NET_USER_H + +#include "spdk/stdinc.h" + +#include <linux/vhost.h> + +#include "spdk_internal/log.h" +#include "spdk_internal/virtio.h" + +#define VHOST_USER_MAX_CONFIG_SIZE 256 + +#ifndef VHOST_USER_PROTOCOL_F_MQ +#define VHOST_USER_PROTOCOL_F_MQ 0 +#endif + +#ifndef VHOST_USER_PROTOCOL_F_CONFIG +#define VHOST_USER_PROTOCOL_F_CONFIG 9 +#endif + +enum vhost_user_request { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_OWNER = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_GET_CONFIG = 24, + VHOST_USER_SET_CONFIG = 25, + VHOST_USER_MAX +}; + +struct virtio_user_backend_ops; + +struct virtio_user_dev { + int vhostfd; + + int callfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + int kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES]; + uint32_t queue_size; + + uint8_t status; + char path[PATH_MAX]; + uint64_t protocol_features; + struct vring vrings[SPDK_VIRTIO_MAX_VIRTQUEUES]; + struct virtio_user_backend_ops *ops; + struct spdk_mem_map *mem_map; +}; + +struct virtio_user_backend_ops { + int (*setup)(struct virtio_user_dev *dev); + int (*send_request)(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg); +}; + +/* get/set config msg */ +struct vhost_user_config { + uint32_t offset; + uint32_t size; + uint32_t flags; + uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; +}; + +extern struct virtio_user_backend_ops ops_user; + +#endif diff --git a/src/spdk/lib/virtio/virtio_user/vhost_user.c b/src/spdk/lib/virtio/virtio_user/vhost_user.c new file mode 100644 index
00000000..46765af5 --- /dev/null +++ b/src/spdk/lib/virtio/virtio_user/vhost_user.c @@ -0,0 +1,518 @@ +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "vhost.h" + +#include "spdk/string.h" + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION 0x1 + +#define VHOST_MEMORY_MAX_NREGIONS 8 + +/** Fixed-size vhost_memory struct */ +struct vhost_memory_padded { + uint32_t nregions; + uint32_t padding; + struct vhost_memory_region regions[VHOST_MEMORY_MAX_NREGIONS]; +}; + +struct vhost_user_msg { + enum vhost_user_request request; + +#define VHOST_USER_VERSION_MASK 0x3 +#define VHOST_USER_REPLY_MASK (0x1 << 2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK 0xff +#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + struct vhost_memory_padded memory; + struct vhost_user_config cfg; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; +} __attribute((packed)); + +#define VHOST_USER_HDR_SIZE offsetof(struct vhost_user_msg, payload.u64) +#define VHOST_USER_PAYLOAD_SIZE \ + (sizeof(struct vhost_user_msg) - VHOST_USER_HDR_SIZE) + +static int +vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num) +{ + int r; + struct msghdr msgh; + struct iovec iov; + size_t fd_size = fd_num * sizeof(int); + char control[CMSG_SPACE(fd_size)]; + struct cmsghdr *cmsg; + + memset(&msgh, 0, sizeof(msgh)); + memset(control, 0, sizeof(control)); + + iov.iov_base = (uint8_t *)buf; + iov.iov_len = len; + + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + + if (fds && fd_num > 0) { + msgh.msg_control = control; + msgh.msg_controllen = sizeof(control); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_len = CMSG_LEN(fd_size); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + memcpy(CMSG_DATA(cmsg), fds, fd_size); + } else { + msgh.msg_control = 
NULL; + msgh.msg_controllen = 0; + } + + do { + r = sendmsg(fd, &msgh, 0); + } while (r < 0 && errno == EINTR); + + if (r == -1) { + return -errno; + } + + return 0; +} + +static int +vhost_user_read(int fd, struct vhost_user_msg *msg) +{ + uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION; + ssize_t ret; + size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload; + + ret = recv(fd, (void *)msg, sz_hdr, 0); + if ((size_t)ret != sz_hdr) { + SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n", + ret, sz_hdr); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + + /* validate msg flags */ + if (msg->flags != (valid_flags)) { + SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n", + msg->flags, valid_flags); + return -EIO; + } + + sz_payload = msg->size; + + if (sizeof(*msg) - sz_hdr < sz_payload) { + SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n", + sz_payload, sizeof(*msg) - sz_hdr); + return -EIO; + } + + if (sz_payload) { + ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0); + if ((size_t)ret != sz_payload) { + SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n", + ret, msg->size); + if (ret == -1) { + return -errno; + } else { + return -EBUSY; + } + } + } + + return 0; +} + +struct hugepage_file_info { + uint64_t addr; /**< virtual addr */ + size_t size; /**< the file size */ + char path[PATH_MAX]; /**< path to backing file */ +}; + +/* Two possible options: + * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file + * array. This is simple but cannot be used in secondary process because + * secondary process will close and munmap that file. + * 2. Match HUGEFILE_FMT to find hugepage files directly. + * + * We choose option 2. 
+ */ +static int +get_hugepage_file_info(struct hugepage_file_info huges[], int max) +{ + int idx, rc; + FILE *f; + char buf[BUFSIZ], *tmp, *tail; + char *str_underline, *str_start; + int huge_index; + uint64_t v_start, v_end; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + SPDK_ERRLOG("cannot open /proc/self/maps\n"); + rc = -errno; + assert(rc < 0); /* scan-build hack */ + return rc; + } + + idx = 0; + while (fgets(buf, sizeof(buf), f) != NULL) { + if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) { + SPDK_ERRLOG("Failed to parse address\n"); + rc = -EIO; + goto out; + } + + tmp = strchr(buf, ' ') + 1; /** skip address */ + tmp = strchr(tmp, ' ') + 1; /** skip perm */ + tmp = strchr(tmp, ' ') + 1; /** skip offset */ + tmp = strchr(tmp, ' ') + 1; /** skip dev */ + tmp = strchr(tmp, ' ') + 1; /** skip inode */ + while (*tmp == ' ') { /** skip spaces */ + tmp++; + } + tail = strrchr(tmp, '\n'); /** remove newline if exists */ + if (tail) { + *tail = '\0'; + } + + /* Match HUGEFILE_FMT, aka "%s/%smap_%d", + * which is defined in eal_filesystem.h + */ + str_underline = strrchr(tmp, '_'); + if (!str_underline) { + continue; + } + + str_start = str_underline - strlen("map"); + if (str_start < tmp) { + continue; + } + + if (sscanf(str_start, "map_%d", &huge_index) != 1) { + continue; + } + + if (idx >= max) { + SPDK_ERRLOG("Exceed maximum of %d\n", max); + rc = -ENOSPC; + goto out; + } + + if (idx > 0 && + strncmp(tmp, huges[idx - 1].path, PATH_MAX) == 0 && + v_start == huges[idx - 1].addr + huges[idx - 1].size) { + huges[idx - 1].size += (v_end - v_start); + continue; + } + + huges[idx].addr = v_start; + huges[idx].size = v_end - v_start; + snprintf(huges[idx].path, PATH_MAX, "%s", tmp); + idx++; + } + + rc = idx; +out: + fclose(f); + return rc; +} + +static int +prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[]) +{ + int i, num; + struct hugepage_file_info huges[VHOST_MEMORY_MAX_NREGIONS]; + + num = get_hugepage_file_info(huges, VHOST_MEMORY_MAX_NREGIONS); + if (num < 0) { + SPDK_ERRLOG("Failed to prepare memory for vhost-user\n"); + return num; + } + + for (i = 0; i < num; ++i) { + /* the memory regions are unaligned */ + msg->payload.memory.regions[i].guest_phys_addr = huges[i].addr; /* use vaddr! 
*/ + msg->payload.memory.regions[i].userspace_addr = huges[i].addr; + msg->payload.memory.regions[i].memory_size = huges[i].size; + msg->payload.memory.regions[i].flags_padding = 0; + fds[i] = open(huges[i].path, O_RDWR); + } + + msg->payload.memory.nregions = num; + msg->payload.memory.padding = 0; + + return 0; +} + +static const char *const vhost_msg_strings[VHOST_USER_MAX] = { + [VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER", + [VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER", + [VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES", + [VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES", + [VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL", + [VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM", + [VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG", + [VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG", +}; + +static int +vhost_user_sock(struct virtio_user_dev *dev, + enum vhost_user_request req, + void *arg) +{ + struct vhost_user_msg msg; + struct vhost_vring_file *file = 0; + int need_reply = 0; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num = 0; + int i, len, rc; + int vhostfd = dev->vhostfd; + + SPDK_DEBUGLOG(SPDK_LOG_VIRTIO_USER, "sent message %d = %s\n", req, vhost_msg_strings[req]); + + msg.request = req; + msg.flags = VHOST_USER_VERSION; + msg.size = 0; + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + need_reply = 1; + break; + + case VHOST_USER_SET_FEATURES: + case VHOST_USER_SET_LOG_BASE: + case VHOST_USER_SET_PROTOCOL_FEATURES: + msg.payload.u64 = *((__u64 *)arg); + msg.size = sizeof(msg.payload.u64); + break; + + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + break; + + case VHOST_USER_SET_MEM_TABLE: + rc = prepare_vhost_memory_user(&msg, fds); + if (rc < 0) { + return rc; + } + fd_num = msg.payload.memory.nregions; + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); + msg.size += fd_num * sizeof(struct vhost_memory_region); + break; + + case VHOST_USER_SET_LOG_FD: + fds[fd_num++] = *((int *)arg); + break; + + case VHOST_USER_SET_VRING_NUM: + case VHOST_USER_SET_VRING_BASE: + case VHOST_USER_SET_VRING_ENABLE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + break; + + case VHOST_USER_GET_VRING_BASE: + memcpy(&msg.payload.state, arg, sizeof(msg.payload.state)); + msg.size = sizeof(msg.payload.state); + need_reply = 1; + break; + + case VHOST_USER_SET_VRING_ADDR: + memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr)); + msg.size = sizeof(msg.payload.addr); + break; + + case VHOST_USER_SET_VRING_KICK: + case VHOST_USER_SET_VRING_CALL: + case VHOST_USER_SET_VRING_ERR: + file = arg; + msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK; + msg.size = sizeof(msg.payload.u64); + if (file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; + } + break; + + case VHOST_USER_GET_CONFIG: + memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + need_reply = 1; + break; + + case VHOST_USER_SET_CONFIG: + 
memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg)); + msg.size = sizeof(msg.payload.cfg); + break; + + default: + SPDK_ERRLOG("trying to send unknown msg\n"); + return -EINVAL; + } + + len = VHOST_USER_HDR_SIZE + msg.size; + rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num); + if (rc < 0) { + SPDK_ERRLOG("%s failed: %s\n", + vhost_msg_strings[req], spdk_strerror(-rc)); + return rc; + } + + if (req == VHOST_USER_SET_MEM_TABLE) + for (i = 0; i < fd_num; ++i) { + close(fds[i]); + } + + if (need_reply) { + rc = vhost_user_read(vhostfd, &msg); + if (rc < 0) { + SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc)); + return rc; + } + + if (req != msg.request) { + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EIO; + } + + switch (req) { + case VHOST_USER_GET_FEATURES: + case VHOST_USER_GET_PROTOCOL_FEATURES: + case VHOST_USER_GET_QUEUE_NUM: + if (msg.size != sizeof(msg.payload.u64)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + *((__u64 *)arg) = msg.payload.u64; + break; + case VHOST_USER_GET_VRING_BASE: + if (msg.size != sizeof(msg.payload.state)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.state, + sizeof(struct vhost_vring_state)); + break; + case VHOST_USER_GET_CONFIG: + if (msg.size != sizeof(msg.payload.cfg)) { + SPDK_WARNLOG("Received bad msg size\n"); + return -EIO; + } + memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg)); + break; + default: + SPDK_WARNLOG("Received unexpected msg type\n"); + return -EBADMSG; + } + } + + return 0; +} + +/** + * Set up environment to talk with a vhost user backend. + * + * @return + * - (-1) if fail; + * - (0) if succeed. + */ +static int +vhost_user_setup(struct virtio_user_dev *dev) +{ + int fd; + int flag; + struct sockaddr_un un; + ssize_t rc; + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno)); + return -errno; + } + + flag = fcntl(fd, F_GETFD); + if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) { + SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno)); + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path); + if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) { + SPDK_ERRLOG("socket path too long\n"); + close(fd); + if (rc < 0) { + return -errno; + } else { + return -EINVAL; + } + } + if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) { + SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno)); + close(fd); + return -errno; + } + + dev->vhostfd = fd; + return 0; +} + +struct virtio_user_backend_ops ops_user = { + .setup = vhost_user_setup, + .send_request = vhost_user_sock, +}; + +SPDK_LOG_REGISTER_COMPONENT("virtio_user", SPDK_LOG_VIRTIO_USER) -- cgit v1.2.3
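Editorial note on the vhost-user transport implemented in vhost_user.c above: every request is a 12-byte header (request, flags carrying the protocol version and, on replies, the REPLY bit, and the payload size) followed by a payload union, with any file descriptors passed out-of-band as SCM_RIGHTS ancillary data. The standalone sketch below is editorial (not part of the imported sources) and performs the same VHOST_USER_GET_FEATURES round trip as vhost_user_sock(), without the fd handling; the socket path is an assumption for illustration only.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>

#define SKETCH_GET_FEATURES 1		/* VHOST_USER_GET_FEATURES */
#define SKETCH_VERSION      0x1	/* VHOST_USER_VERSION */

/* 12-byte wire header, i.e. VHOST_USER_HDR_SIZE in vhost_user.c */
struct sketch_hdr {
	uint32_t request;
	uint32_t flags;
	uint32_t size;
};

int main(void)
{
	struct sockaddr_un un = { .sun_family = AF_UNIX };
	struct sketch_hdr hdr = { .request = SKETCH_GET_FEATURES, .flags = SKETCH_VERSION, .size = 0 };
	uint64_t features = 0;
	int fd;

	/* assumed socket path, for illustration only */
	snprintf(un.sun_path, sizeof(un.sun_path), "%s", "/var/tmp/vhost.0");

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0 || connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
		perror("socket/connect");
		return 1;
	}

	/* GET_FEATURES carries no payload; the reply reuses the header (REPLY bit
	 * set, size == 8) followed by the 64-bit feature bitmask. */
	if (send(fd, &hdr, sizeof(hdr), 0) != (ssize_t)sizeof(hdr) ||
	    recv(fd, &hdr, sizeof(hdr), MSG_WAITALL) != (ssize_t)sizeof(hdr) ||
	    hdr.size != sizeof(features) ||
	    recv(fd, &features, sizeof(features), MSG_WAITALL) != (ssize_t)sizeof(features)) {
		perror("vhost-user GET_FEATURES");
		close(fd);
		return 1;
	}

	printf("device features: 0x%016" PRIx64 "\n", features);
	close(fd);
	return 0;
}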