| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 | /src/spdk/lib/nvme |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/nvme')
26 files changed, 23768 insertions, 0 deletions
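For orientation before the diff itself: nvme.c (added below) implements the public probe/attach API (spdk_nvme_probe(), spdk_nvme_ctrlr_alloc_io_qpair(), spdk_nvme_detach()). The following is a minimal sketch of how an application typically consumes that API, not part of the commit; the callback names (my_probe_cb, my_attach_cb) and the helper function are illustrative, and it assumes the public header spdk/nvme.h is used and that the SPDK environment has already been initialized (e.g. via spdk_env_init()).

#include "spdk/stdinc.h"
#include "spdk/nvme.h"

static bool
my_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	    struct spdk_nvme_ctrlr_opts *opts)
{
	/* Returning true asks the driver to attach; opts may be tuned here. */
	return true;
}

static void
my_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
	     struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
{
	struct spdk_nvme_ctrlr **out = cb_ctx;

	/* The driver already took a per-process reference on ctrlr before
	 * calling attach_cb(); for this sketch we simply remember the most
	 * recently attached controller.
	 */
	*out = ctrlr;
}

static int
attach_one_pcie_ctrlr(struct spdk_nvme_ctrlr **ctrlr, struct spdk_nvme_qpair **qpair)
{
	*ctrlr = NULL;

	/* A NULL transport ID makes spdk_nvme_probe() scan the default (PCIe) transport. */
	if (spdk_nvme_probe(NULL, ctrlr, my_probe_cb, my_attach_cb, NULL) != 0 ||
	    *ctrlr == NULL) {
		return -1;
	}

	/* NULL opts selects the controller's default I/O queue options. */
	*qpair = spdk_nvme_ctrlr_alloc_io_qpair(*ctrlr, NULL, 0);
	if (*qpair == NULL) {
		spdk_nvme_detach(*ctrlr);
		return -1;
	}

	/* I/O completions are then reaped by polling, e.g. in the app's main loop:
	 * spdk_nvme_qpair_process_completions(*qpair, 0);
	 */
	return 0;
}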
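The transport-ID parser and the synchronous connect path added in nvme.c (spdk_nvme_transport_id_parse(), spdk_nvme_connect()) are typically combined as in this sketch; the TCP address, service ID, and subsystem NQN are placeholders.

#include "spdk/nvme.h"

/* Parse a transport ID string using the keys handled by
 * spdk_nvme_transport_id_parse() (trtype, adrfam, traddr, trsvcid, subnqn)
 * and connect to that controller synchronously.
 */
static struct spdk_nvme_ctrlr *
connect_tcp_target(void)
{
	struct spdk_nvme_transport_id trid = {};

	if (spdk_nvme_transport_id_parse(&trid,
	    "trtype:TCP adrfam:IPv4 traddr:192.168.0.10 trsvcid:4420 "
	    "subnqn:nqn.2016-06.io.spdk:cnode1") != 0) {
		return NULL;
	}

	/* NULL opts means spdk_nvme_connect() uses the controller defaults. */
	return spdk_nvme_connect(&trid, NULL, 0);
}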
diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile
new file mode 100644
index 000000000..1c02965f5
--- /dev/null
+++ b/src/spdk/lib/nvme/Makefile
@@ -0,0 +1,73 @@
+#
+#  BSD LICENSE
+#
+#  Copyright (c) Intel Corporation.
+#  All rights reserved.
+#
+#  Redistribution and use in source and binary forms, with or without
+#  modification, are permitted provided that the following conditions
+#  are met:
+#
+#    * Redistributions of source code must retain the above copyright
+#      notice, this list of conditions and the following disclaimer.
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in
+#      the documentation and/or other materials provided with the
+#      distribution.
+#    * Neither the name of Intel Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived
+#      from this software without specific prior written permission.
+#
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+SO_VER := 4
+SO_MINOR := 0
+
+C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \
+	nvme_ns_ocssd_cmd.c nvme_tcp.c nvme_opal.c nvme_io_msg.c nvme_poll_group.c
+C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c
+C_SRCS-$(CONFIG_NVME_CUSE) += nvme_cuse.c
+
+LIBNAME = nvme
+LOCAL_SYS_LIBS = -luuid
+ifeq ($(CONFIG_RDMA),y)
+LOCAL_SYS_LIBS += -libverbs -lrdmacm
+#Attach only if FreeBSD and RDMA is specified with configure
+ifeq ($(OS),FreeBSD)
+# Mellanox - MLX4 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx4.*)","")
+LOCAL_SYS_LIBS += -lmlx4
+endif
+# Mellanox - MLX5 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx5.*)","")
+LOCAL_SYS_LIBS += -lmlx5
+endif
+# Chelsio HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libcxgb4.*)","")
+LOCAL_SYS_LIBS += -lcxgb4
+endif
+endif
+endif
+
+ifeq ($(CONFIG_NVME_CUSE),y)
+# fuse requires to set _FILE_OFFSET_BITS to 64 bits even for 64 bit machines
+CFLAGS += -D_FILE_OFFSET_BITS=64
+endif
+
+SPDK_MAP_FILE = $(abspath $(CURDIR)/spdk_nvme.map)
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c
new file mode 100644
index 000000000..9393810a6
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme.c
@@ -0,0 +1,1423 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) Intel Corporation. All rights reserved.
+ *   Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_uevent.h" + +#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver" + +struct nvme_driver *g_spdk_nvme_driver; +pid_t g_spdk_nvme_pid; + +/* gross timeout of 180 seconds in milliseconds */ +static int g_nvme_driver_timeout_ms = 3 * 60 * 1000; + +/* Per-process attached controller list */ +static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs = + TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs); + +/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */ +static bool +nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE; +} + +void +nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr) +{ + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); +} + +int +spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + nvme_ctrlr_proc_put_ref(ctrlr); + + if (nvme_ctrlr_get_ref_count(ctrlr) == 0) { + nvme_io_msg_ctrlr_detach(ctrlr); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + nvme_ctrlr_destruct(ctrlr); + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return 0; +} + +void +nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_completion_poll_status *status = arg; + + if (status->timed_out) { + /* There is no routine waiting for the completion of this request, free allocated memory */ + free(status); + return; + } + + /* + * Copy status into the argument passed by the caller, so that + * the caller can check the status to determine if the + * the request passed or failed. + */ + memcpy(&status->cpl, cpl, sizeof(*cpl)); + status->done = true; +} + +/** + * Poll qpair for completions until a command completes. 
+ * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param robust_mutex optional robust mutex to lock while polling qpair + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_robust_lock( + struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex) +{ + int rc; + + while (status->done == false) { + if (robust_mutex) { + nvme_robust_mutex_lock(robust_mutex); + } + + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (robust_mutex) { + nvme_robust_mutex_unlock(robust_mutex); + } + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0; +} + +int +nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status) +{ + return nvme_wait_for_completion_robust_lock(qpair, status, NULL); +} + +/** + * Poll qpair for completions until a command completes. + * + * \param qpair queue to poll + * \param status completion status. The user must fill this structure with zeroes before calling + * this function + * \param timeout_in_secs optional timeout + * + * \return 0 if command completed without error, + * -EIO if command completed with error, + * -ECANCELED if command is not completed due to transport/device error or time expired + * + * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback + * and status as the callback argument. + */ +int +nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs) +{ + uint64_t timeout_tsc = 0; + int rc = 0; + + if (timeout_in_secs) { + timeout_tsc = spdk_get_ticks() + timeout_in_secs * spdk_get_ticks_hz(); + } + + while (status->done == false) { + rc = spdk_nvme_qpair_process_completions(qpair, 0); + + if (rc < 0) { + status->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + status->cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + break; + } + if (timeout_tsc && spdk_get_ticks() > timeout_tsc) { + break; + } + } + + if (status->done == false || rc < 0) { + if (status->done == false) { + status->timed_out = true; + } + return -ECANCELED; + } + + return spdk_nvme_cpl_is_error(&status->cpl) ? 
-EIO : 0; +} + +static void +nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = arg; + enum spdk_nvme_data_transfer xfer; + + if (req->user_buffer && req->payload_size) { + /* Copy back to the user buffer and free the contig buffer */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST || + xfer == SPDK_NVME_DATA_BIDIRECTIONAL) { + assert(req->pid == getpid()); + memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size); + } + + spdk_free(req->payload.contig_or_cb_arg); + } + + /* Call the user's original callback now that the buffer has been copied */ + req->user_cb_fn(req->user_cb_arg, cpl); +} + +/** + * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer. + * + * This is intended for use in non-fast-path functions (admin commands, reservations, etc.) + * where the overhead of a copy is not a problem. + */ +struct nvme_request * +nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, bool host_to_controller) +{ + struct nvme_request *req; + void *dma_buffer = NULL; + + if (buffer && payload_size) { + dma_buffer = spdk_zmalloc(payload_size, 4096, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (!dma_buffer) { + return NULL; + } + + if (host_to_controller) { + memcpy(dma_buffer, buffer, payload_size); + } + } + + req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete, + NULL); + if (!req) { + spdk_free(dma_buffer); + return NULL; + } + + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + req->user_buffer = buffer; + req->cb_arg = req; + + return req; +} + +/** + * Check if a request has exceeded the controller timeout. + * + * \param req request to check for timeout. + * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn) + * \param active_proc per-process data for the controller associated with req + * \param now_tick current time from spdk_get_ticks() + * \return 0 if requests submitted more recently than req should still be checked for timeouts, or + * 1 if requests newer than req need not be checked. + * + * The request's timeout callback will be called if needed; the caller is only responsible for + * calling this function on each outstanding request. + */ +int +nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, + uint64_t now_tick) +{ + struct spdk_nvme_qpair *qpair = req->qpair; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(active_proc->timeout_cb_fn != NULL); + + if (req->timed_out || req->submit_tick == 0) { + return 0; + } + + if (req->pid != g_spdk_nvme_pid) { + return 0; + } + + if (nvme_qpair_is_admin_queue(qpair) && + req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + return 0; + } + + if (req->submit_tick + active_proc->timeout_ticks > now_tick) { + return 1; + } + + req->timed_out = true; + + /* + * We don't want to expose the admin queue to the user, + * so when we're timing out admin commands set the + * qpair to NULL. + */ + active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr, + nvme_qpair_is_admin_queue(qpair) ? 
NULL : qpair, + cid); + return 0; +} + +int +nvme_robust_mutex_init_shared(pthread_mutex_t *mtx) +{ + int rc = 0; + +#ifdef __FreeBSD__ + pthread_mutex_init(mtx, NULL); +#else + pthread_mutexattr_t attr; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); +#endif + + return rc; +} + +int +nvme_driver_init(void) +{ + static pthread_mutex_t g_init_mutex = PTHREAD_MUTEX_INITIALIZER; + int ret = 0; + /* Any socket ID */ + int socket_id = -1; + + /* Use a special process-private mutex to ensure the global + * nvme driver object (g_spdk_nvme_driver) gets initialized by + * only one thread. Once that object is established and its + * mutex is initialized, we can unlock this mutex and use that + * one instead. + */ + pthread_mutex_lock(&g_init_mutex); + + /* Each process needs its own pid. */ + g_spdk_nvme_pid = getpid(); + + /* + * Only one thread from one process will do this driver init work. + * The primary process will reserve the shared memory and do the + * initialization. + * The secondary process will lookup the existing reserved memory. + */ + if (spdk_process_is_primary()) { + /* The unique named memzone already reserved. */ + if (g_spdk_nvme_driver != NULL) { + pthread_mutex_unlock(&g_init_mutex); + return 0; + } else { + g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME, + sizeof(struct nvme_driver), socket_id, + SPDK_MEMZONE_NO_IOVA_CONTIG); + } + + if (g_spdk_nvme_driver == NULL) { + SPDK_ERRLOG("primary process failed to reserve memory\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME); + + /* The unique named memzone already reserved by the primary process. */ + if (g_spdk_nvme_driver != NULL) { + int ms_waited = 0; + + /* Wait the nvme driver to get initialized. */ + while ((g_spdk_nvme_driver->initialized == false) && + (ms_waited < g_nvme_driver_timeout_ms)) { + ms_waited++; + nvme_delay(1000); /* delay 1ms */ + } + if (g_spdk_nvme_driver->initialized == false) { + SPDK_ERRLOG("timeout waiting for primary process to init\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + } else { + SPDK_ERRLOG("primary process is not started yet\n"); + pthread_mutex_unlock(&g_init_mutex); + return -1; + } + + pthread_mutex_unlock(&g_init_mutex); + return 0; + } + + /* + * At this moment, only one thread from the primary process will do + * the g_spdk_nvme_driver initialization + */ + assert(spdk_process_is_primary()); + + ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock); + if (ret != 0) { + SPDK_ERRLOG("failed to initialize mutex\n"); + spdk_memzone_free(SPDK_NVME_DRIVER_NAME); + pthread_mutex_unlock(&g_init_mutex); + return ret; + } + + /* The lock in the shared g_spdk_nvme_driver object is now ready to + * be used - so we can unlock the g_init_mutex here. 
+ */ + pthread_mutex_unlock(&g_init_mutex); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + g_spdk_nvme_driver->initialized = false; + g_spdk_nvme_driver->hotplug_fd = nvme_uevent_connect(); + if (g_spdk_nvme_driver->hotplug_fd < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n"); + } + + TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs); + + spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id); + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ret; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +int +nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle) +{ + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_nvme_ctrlr_opts opts; + + assert(trid != NULL); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts)); + + if (!probe_ctx->probe_cb || probe_ctx->probe_cb(probe_ctx->cb_ctx, trid, &opts)) { + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + if (ctrlr) { + /* This ctrlr already exists. + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. */ + nvme_ctrlr_proc_get_ref(ctrlr); + + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + return 0; + } + + ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle); + if (ctrlr == NULL) { + SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr); + return -1; + } + ctrlr->remove_cb = probe_ctx->remove_cb; + ctrlr->cb_ctx = probe_ctx->cb_ctx; + + if (ctrlr->quirks & NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE && + ctrlr->opts.io_queue_size == DEFAULT_IO_QUEUE_SIZE) { + /* If the user specifically set an IO queue size different than the + * default, use that value. Otherwise overwrite with the quirked value. + * This allows this quirk to be overridden when necessary. + * However, cap.mqes still needs to be respected. + */ + ctrlr->opts.io_queue_size = spdk_min(DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK, ctrlr->cap.bits.mqes + 1u); + } + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + TAILQ_INSERT_TAIL(&probe_ctx->init_ctrlrs, ctrlr, tailq); + return 0; + } + + return 1; +} + +static int +nvme_ctrlr_poll_internal(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + rc = nvme_ctrlr_process_init(ctrlr); + + if (rc) { + /* Controller failed to initialize. */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr); + nvme_ctrlr_fail(ctrlr, false); + nvme_ctrlr_destruct(ctrlr); + return rc; + } + + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return 0; + } + + STAILQ_INIT(&ctrlr->io_producers); + + /* + * Controller has been initialized. + * Move it to the attached_ctrlrs list. + */ + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + if (nvme_ctrlr_shared(ctrlr)) { + TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq); + } else { + TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq); + } + + /* + * Increase the ref count before calling attach_cb() as the user may + * call nvme_detach() immediately. 
+ */ + nvme_ctrlr_proc_get_ref(ctrlr); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + if (probe_ctx->attach_cb) { + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + return 0; + } + + return 0; +} + +static int +nvme_init_controllers(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + + while (true) { + rc = spdk_nvme_probe_poll_async(probe_ctx); + if (rc != -EAGAIN) { + return rc; + } + } + + return rc; +} + +/* This function must not be called while holding g_spdk_nvme_driver->lock */ +static struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + ctrlr = nvme_get_ctrlr_by_trid_unsafe(trid); + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return ctrlr; +} + +/* This function must be called while holding g_spdk_nvme_driver->lock */ +struct spdk_nvme_ctrlr * +nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvme_ctrlr *ctrlr; + + /* Search per-process list */ + TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + /* Search multi-process shared list */ + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) { + return ctrlr; + } + } + + return NULL; +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +nvme_probe_internal(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + spdk_nvme_trid_populate_transport(&probe_ctx->trid, probe_ctx->trid.trtype); + if (!spdk_nvme_transport_available_by_name(probe_ctx->trid.trstring)) { + SPDK_ERRLOG("NVMe trtype %u not available\n", probe_ctx->trid.trtype); + return -1; + } + + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + + rc = nvme_transport_ctrlr_scan(probe_ctx, direct_connect); + if (rc != 0) { + SPDK_ERRLOG("NVMe ctrlr scan failed\n"); + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + TAILQ_REMOVE(&probe_ctx->init_ctrlrs, ctrlr, tailq); + nvme_transport_ctrlr_destruct(ctrlr); + } + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + return -1; + } + + /* + * Probe controllers on the shared_attached_ctrlrs list + */ + if (!spdk_process_is_primary() && (probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE)) { + TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) { + /* Do not attach other ctrlrs if user specify a valid trid */ + if ((strlen(probe_ctx->trid.traddr) != 0) && + (spdk_nvme_transport_id_compare(&probe_ctx->trid, &ctrlr->trid))) { + continue; + } + + /* Do not attach if we failed to initialize it in this process */ + if (nvme_ctrlr_get_current_process(ctrlr) == NULL) { + continue; + } + + nvme_ctrlr_proc_get_ref(ctrlr); + + /* + * Unlock while calling attach_cb() so the user can call other functions + * that may take the driver lock, like nvme_detach(). 
+ */ + if (probe_ctx->attach_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + probe_ctx->attach_cb(probe_ctx->cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + + return 0; +} + +static void +nvme_probe_ctx_init(struct spdk_nvme_probe_ctx *probe_ctx, + const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + probe_ctx->trid = *trid; + probe_ctx->cb_ctx = cb_ctx; + probe_ctx->probe_cb = probe_cb; + probe_ctx->attach_cb = attach_cb; + probe_ctx->remove_cb = remove_cb; + TAILQ_INIT(&probe_ctx->init_ctrlrs); +} + +int +spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx, + spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + struct spdk_nvme_transport_id trid_pcie; + struct spdk_nvme_probe_ctx *probe_ctx; + + if (trid == NULL) { + memset(&trid_pcie, 0, sizeof(trid_pcie)); + spdk_nvme_trid_populate_transport(&trid_pcie, SPDK_NVME_TRANSPORT_PCIE); + trid = &trid_pcie; + } + + probe_ctx = spdk_nvme_probe_async(trid, cb_ctx, probe_cb, + attach_cb, remove_cb); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return -1; + } + + /* + * Keep going even if one or more nvme_attach() calls failed, + * but maintain the value of rc to signal errors when we return. + */ + return nvme_init_controllers(probe_ctx); +} + +static bool +nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_ctrlr_opts *opts) +{ + struct spdk_nvme_ctrlr_opts *requested_opts = cb_ctx; + + assert(requested_opts); + memcpy(opts, requested_opts, sizeof(*opts)); + + return true; +} + +static void +nvme_ctrlr_opts_init(struct spdk_nvme_ctrlr_opts *opts, + const struct spdk_nvme_ctrlr_opts *opts_user, + size_t opts_size_user) +{ + assert(opts); + assert(opts_user); + + spdk_nvme_ctrlr_get_default_ctrlr_opts(opts, opts_size_user); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= (opts->opts_size) + + if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = opts_user->num_io_queues; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = opts_user->use_cmb_sqs; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = opts_user->no_shn_notification; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = opts_user->arb_mechanism; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = opts_user->arbitration_burst; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = opts_user->low_priority_weight; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = opts_user->medium_priority_weight; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = opts_user->high_priority_weight; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + opts->keep_alive_timeout_ms = opts_user->keep_alive_timeout_ms; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = opts_user->transport_retry_count; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = opts_user->io_queue_size; + } + + if (FIELD_OK(hostnqn)) { + memcpy(opts->hostnqn, opts_user->hostnqn, sizeof(opts_user->hostnqn)); + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = opts_user->io_queue_requests; + } + + if (FIELD_OK(src_addr)) { + 
memcpy(opts->src_addr, opts_user->src_addr, sizeof(opts_user->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memcpy(opts->src_svcid, opts_user->src_svcid, sizeof(opts_user->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memcpy(opts->host_id, opts_user->host_id, sizeof(opts_user->host_id)); + } + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, opts_user->extended_host_id, + sizeof(opts_user->extended_host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = opts_user->command_set; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = opts_user->admin_timeout_ms; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = opts_user->header_digest; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = opts_user->data_digest; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = opts_user->disable_error_logging; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = opts_user->transport_ack_timeout; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = opts_user->admin_queue_size; + } +#undef FIELD_OK +} + +struct spdk_nvme_ctrlr * +spdk_nvme_connect(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + int rc; + struct spdk_nvme_ctrlr *ctrlr = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_nvme_ctrlr_opts *opts_local_p = NULL; + struct spdk_nvme_ctrlr_opts opts_local; + + if (trid == NULL) { + SPDK_ERRLOG("No transport ID specified\n"); + return NULL; + } + + if (opts) { + opts_local_p = &opts_local; + nvme_ctrlr_opts_init(opts_local_p, opts, opts_size); + } + + probe_ctx = spdk_nvme_connect_async(trid, opts_local_p, NULL); + if (!probe_ctx) { + SPDK_ERRLOG("Create probe context failed\n"); + return NULL; + } + + rc = nvme_init_controllers(probe_ctx); + if (rc != 0) { + return NULL; + } + + ctrlr = nvme_get_ctrlr_by_trid(trid); + + return ctrlr; +} + +void +spdk_nvme_trid_populate_transport(struct spdk_nvme_transport_id *trid, + enum spdk_nvme_transport_type trtype) +{ + const char *trstring = ""; + + trid->trtype = trtype; + switch (trtype) { + case SPDK_NVME_TRANSPORT_FC: + trstring = SPDK_NVME_TRANSPORT_NAME_FC; + break; + case SPDK_NVME_TRANSPORT_PCIE: + trstring = SPDK_NVME_TRANSPORT_NAME_PCIE; + break; + case SPDK_NVME_TRANSPORT_RDMA: + trstring = SPDK_NVME_TRANSPORT_NAME_RDMA; + break; + case SPDK_NVME_TRANSPORT_TCP: + trstring = SPDK_NVME_TRANSPORT_NAME_TCP; + break; + case SPDK_NVME_TRANSPORT_CUSTOM: + default: + SPDK_ERRLOG("don't use this for custom transports\n"); + assert(0); + return; + } + snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); +} + +int +spdk_nvme_transport_id_populate_trstring(struct spdk_nvme_transport_id *trid, const char *trstring) +{ + int len, i, rc; + + if (trstring == NULL) { + return -EINVAL; + } + + len = strnlen(trstring, SPDK_NVMF_TRSTRING_MAX_LEN); + if (len == SPDK_NVMF_TRSTRING_MAX_LEN) { + return -EINVAL; + } + + rc = snprintf(trid->trstring, SPDK_NVMF_TRSTRING_MAX_LEN, "%s", trstring); + if (rc < 0) { + return rc; + } + + /* cast official trstring to uppercase version of input. 
*/ + for (i = 0; i < len; i++) { + trid->trstring[i] = toupper(trid->trstring[i]); + } + return 0; +} + +int +spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str) +{ + if (trtype == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "PCIe") == 0) { + *trtype = SPDK_NVME_TRANSPORT_PCIE; + } else if (strcasecmp(str, "RDMA") == 0) { + *trtype = SPDK_NVME_TRANSPORT_RDMA; + } else if (strcasecmp(str, "FC") == 0) { + *trtype = SPDK_NVME_TRANSPORT_FC; + } else if (strcasecmp(str, "TCP") == 0) { + *trtype = SPDK_NVME_TRANSPORT_TCP; + } else { + *trtype = SPDK_NVME_TRANSPORT_CUSTOM; + } + return 0; +} + +const char * +spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype) +{ + switch (trtype) { + case SPDK_NVME_TRANSPORT_PCIE: + return "PCIe"; + case SPDK_NVME_TRANSPORT_RDMA: + return "RDMA"; + case SPDK_NVME_TRANSPORT_FC: + return "FC"; + case SPDK_NVME_TRANSPORT_TCP: + return "TCP"; + case SPDK_NVME_TRANSPORT_CUSTOM: + return "CUSTOM"; + default: + return NULL; + } +} + +int +spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str) +{ + if (adrfam == NULL || str == NULL) { + return -EINVAL; + } + + if (strcasecmp(str, "IPv4") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV4; + } else if (strcasecmp(str, "IPv6") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IPV6; + } else if (strcasecmp(str, "IB") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_IB; + } else if (strcasecmp(str, "FC") == 0) { + *adrfam = SPDK_NVMF_ADRFAM_FC; + } else { + return -ENOENT; + } + return 0; +} + +const char * +spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam) +{ + switch (adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + return "IPv4"; + case SPDK_NVMF_ADRFAM_IPV6: + return "IPv6"; + case SPDK_NVMF_ADRFAM_IB: + return "IB"; + case SPDK_NVMF_ADRFAM_FC: + return "FC"; + default: + return NULL; + } +} + +static size_t +parse_next_key(const char **str, char *key, char *val, size_t key_buf_size, size_t val_buf_size) +{ + + const char *sep, *sep1; + const char *whitespace = " \t\n"; + size_t key_len, val_len; + + *str += strspn(*str, whitespace); + + sep = strchr(*str, ':'); + if (!sep) { + sep = strchr(*str, '='); + if (!sep) { + SPDK_ERRLOG("Key without ':' or '=' separator\n"); + return 0; + } + } else { + sep1 = strchr(*str, '='); + if ((sep1 != NULL) && (sep1 < sep)) { + sep = sep1; + } + } + + key_len = sep - *str; + if (key_len >= key_buf_size) { + SPDK_ERRLOG("Key length %zu greater than maximum allowed %zu\n", + key_len, key_buf_size - 1); + return 0; + } + + memcpy(key, *str, key_len); + key[key_len] = '\0'; + + *str += key_len + 1; /* Skip key: */ + val_len = strcspn(*str, whitespace); + if (val_len == 0) { + SPDK_ERRLOG("Key without value\n"); + return 0; + } + + if (val_len >= val_buf_size) { + SPDK_ERRLOG("Value length %zu greater than maximum allowed %zu\n", + val_len, val_buf_size - 1); + return 0; + } + + memcpy(val, *str, val_len); + val[val_len] = '\0'; + + *str += val_len; + + return val_len; +} + +int +spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (trid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse transport ID\n"); + return -EINVAL; + } + + if (strcasecmp(key, "trtype") == 0) { + if (spdk_nvme_transport_id_populate_trstring(trid, val) != 0) { + SPDK_ERRLOG("invalid transport 
'%s'\n", val); + return -EINVAL; + } + if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) { + SPDK_ERRLOG("Unknown trtype '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "adrfam") == 0) { + if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) { + SPDK_ERRLOG("Unknown adrfam '%s'\n", val); + return -EINVAL; + } + } else if (strcasecmp(key, "traddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(trid->traddr, val, val_len + 1); + } else if (strcasecmp(key, "trsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(trid->trsvcid, val, val_len + 1); + } else if (strcasecmp(key, "priority") == 0) { + if (val_len > SPDK_NVMF_PRIORITY_MAX_LEN) { + SPDK_ERRLOG("priority length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_PRIORITY_MAX_LEN); + return -EINVAL; + } + trid->priority = spdk_strtol(val, 10); + } else if (strcasecmp(key, "subnqn") == 0) { + if (val_len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_NQN_MAX_LEN); + return -EINVAL; + } + memcpy(trid->subnqn, val, val_len + 1); + } else if (strcasecmp(key, "hostaddr") == 0) { + continue; + } else if (strcasecmp(key, "hostsvcid") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + /* + * Special case. The namespace id parameter may + * optionally be passed in the transport id string + * for an SPDK application (e.g. nvme/perf) + * and additionally parsed therein to limit + * targeting a specific namespace. For this + * scenario, just silently ignore this key + * rather than letting it default to logging + * it as an invalid key. + */ + continue; + } else if (strcasecmp(key, "alt_traddr") == 0) { + /* + * Used by applications for enabling transport ID failover. + * Please see the case above for more information on custom parameters. + */ + continue; + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +int +spdk_nvme_host_id_parse(struct spdk_nvme_host_id *hostid, const char *str) +{ + + size_t key_size = 32; + size_t val_size = 1024; + size_t val_len; + char key[key_size]; + char val[val_size]; + + if (hostid == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + + val_len = parse_next_key(&str, key, val, key_size, val_size); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse host ID\n"); + return val_len; + } + + /* Ignore the rest of the options from the transport ID. 
*/ + if (strcasecmp(key, "trtype") == 0) { + continue; + } else if (strcasecmp(key, "adrfam") == 0) { + continue; + } else if (strcasecmp(key, "traddr") == 0) { + continue; + } else if (strcasecmp(key, "trsvcid") == 0) { + continue; + } else if (strcasecmp(key, "subnqn") == 0) { + continue; + } else if (strcasecmp(key, "priority") == 0) { + continue; + } else if (strcasecmp(key, "ns") == 0) { + continue; + } else if (strcasecmp(key, "hostaddr") == 0) { + if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) { + SPDK_ERRLOG("hostaddr length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRADDR_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostaddr, val, val_len + 1); + + } else if (strcasecmp(key, "hostsvcid") == 0) { + if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) { + SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n", + val_len, SPDK_NVMF_TRSVCID_MAX_LEN); + return -EINVAL; + } + memcpy(hostid->hostsvcid, val, val_len + 1); + } else { + SPDK_ERRLOG("Unknown transport ID key '%s'\n", key); + } + } + + return 0; +} + +static int +cmp_int(int a, int b) +{ + return a - b; +} + +int +spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1, + const struct spdk_nvme_transport_id *trid2) +{ + int cmp; + + if (trid1->trtype == SPDK_NVME_TRANSPORT_CUSTOM) { + cmp = strcasecmp(trid1->trstring, trid2->trstring); + } else { + cmp = cmp_int(trid1->trtype, trid2->trtype); + } + + if (cmp) { + return cmp; + } + + if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) { + struct spdk_pci_addr pci_addr1 = {}; + struct spdk_pci_addr pci_addr2 = {}; + + /* Normalize PCI addresses before comparing */ + if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 || + spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) { + return -1; + } + + /* PCIe transport ID only uses trtype and traddr */ + return spdk_pci_addr_compare(&pci_addr1, &pci_addr2); + } + + cmp = strcasecmp(trid1->traddr, trid2->traddr); + if (cmp) { + return cmp; + } + + cmp = cmp_int(trid1->adrfam, trid2->adrfam); + if (cmp) { + return cmp; + } + + cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid); + if (cmp) { + return cmp; + } + + cmp = strcmp(trid1->subnqn, trid2->subnqn); + if (cmp) { + return cmp; + } + + return 0; +} + +int +spdk_nvme_prchk_flags_parse(uint32_t *prchk_flags, const char *str) +{ + size_t val_len; + char key[32]; + char val[1024]; + + if (prchk_flags == NULL || str == NULL) { + return -EINVAL; + } + + while (*str != '\0') { + val_len = parse_next_key(&str, key, val, sizeof(key), sizeof(val)); + + if (val_len == 0) { + SPDK_ERRLOG("Failed to parse prchk\n"); + return -EINVAL; + } + + if (strcasecmp(key, "prchk") == 0) { + if (strcasestr(val, "reftag") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_REFTAG; + } + if (strcasestr(val, "guard") != NULL) { + *prchk_flags |= SPDK_NVME_IO_FLAGS_PRCHK_GUARD; + } + } else { + SPDK_ERRLOG("Unknown key '%s'\n", key); + return -EINVAL; + } + } + + return 0; +} + +const char * +spdk_nvme_prchk_flags_str(uint32_t prchk_flags) +{ + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_REFTAG) { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:reftag|guard"; + } else { + return "prchk:reftag"; + } + } else { + if (prchk_flags & SPDK_NVME_IO_FLAGS_PRCHK_GUARD) { + return "prchk:guard"; + } else { + return NULL; + } + } +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_probe_async(const struct spdk_nvme_transport_id *trid, + void *cb_ctx, + spdk_nvme_probe_cb probe_cb, + spdk_nvme_attach_cb attach_cb, + spdk_nvme_remove_cb remove_cb) +{ + int rc; + struct 
spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + nvme_probe_ctx_init(probe_ctx, trid, cb_ctx, probe_cb, attach_cb, remove_cb); + rc = nvme_probe_internal(probe_ctx, false); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +int +spdk_nvme_probe_poll_async(struct spdk_nvme_probe_ctx *probe_ctx) +{ + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp; + + if (!spdk_process_is_primary() && probe_ctx->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + free(probe_ctx); + return 0; + } + + TAILQ_FOREACH_SAFE(ctrlr, &probe_ctx->init_ctrlrs, tailq, ctrlr_tmp) { + rc = nvme_ctrlr_poll_internal(ctrlr, probe_ctx); + if (rc != 0) { + rc = -EIO; + break; + } + } + + if (rc != 0 || TAILQ_EMPTY(&probe_ctx->init_ctrlrs)) { + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + g_spdk_nvme_driver->initialized = true; + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + free(probe_ctx); + return rc; + } + + return -EAGAIN; +} + +struct spdk_nvme_probe_ctx * +spdk_nvme_connect_async(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + spdk_nvme_attach_cb attach_cb) +{ + int rc; + spdk_nvme_probe_cb probe_cb = NULL; + struct spdk_nvme_probe_ctx *probe_ctx; + + rc = nvme_driver_init(); + if (rc != 0) { + return NULL; + } + + probe_ctx = calloc(1, sizeof(*probe_ctx)); + if (!probe_ctx) { + return NULL; + } + + if (opts) { + probe_cb = nvme_connect_probe_cb; + } + + nvme_probe_ctx_init(probe_ctx, trid, (void *)opts, probe_cb, attach_cb, NULL); + rc = nvme_probe_internal(probe_ctx, true); + if (rc != 0) { + free(probe_ctx); + return NULL; + } + + return probe_ctx; +} + +SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME) diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c new file mode 100644 index 000000000..ced02e9bb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr.c @@ -0,0 +1,3639 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#include "spdk/env.h" +#include "spdk/string.h" + +struct nvme_active_ns_ctx; + +static void nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr); +static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer); +static void nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx); +static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns); +static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns); + +static int +nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + &cc->raw); +} + +static int +nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw), + &csts->raw); +} + +int +nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap) +{ + return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw), + &cap->raw); +} + +int +nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw), + &vs->raw); +} + +static int +nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc) +{ + return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc->raw); +} + +int +nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +/* When the field in spdk_nvme_ctrlr_opts are changed and you change this function, please + * also update the nvme_ctrl_opts_init function in nvme_ctrlr.c + */ +void +spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size) +{ + char host_id_str[SPDK_UUID_STRING_LEN]; + + assert(opts); + + opts->opts_size = opts_size; + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(num_io_queues)) { + opts->num_io_queues = DEFAULT_MAX_IO_QUEUES; + } + + if (FIELD_OK(use_cmb_sqs)) { + opts->use_cmb_sqs = true; + } + + if (FIELD_OK(no_shn_notification)) { + opts->no_shn_notification = false; + } + + if (FIELD_OK(arb_mechanism)) { + opts->arb_mechanism = SPDK_NVME_CC_AMS_RR; + } + + if (FIELD_OK(arbitration_burst)) { + opts->arbitration_burst = 0; + } + + if (FIELD_OK(low_priority_weight)) { + opts->low_priority_weight = 0; + } + + if (FIELD_OK(medium_priority_weight)) { + opts->medium_priority_weight = 0; + } + + if (FIELD_OK(high_priority_weight)) { + opts->high_priority_weight = 0; + } + + if (FIELD_OK(keep_alive_timeout_ms)) { + 
opts->keep_alive_timeout_ms = MIN_KEEP_ALIVE_TIMEOUT_IN_MS; + } + + if (FIELD_OK(transport_retry_count)) { + opts->transport_retry_count = SPDK_NVME_DEFAULT_RETRY_COUNT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE; + } + + if (nvme_driver_init() == 0) { + if (FIELD_OK(hostnqn)) { + spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str), + &g_spdk_nvme_driver->default_extended_host_id); + snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str); + } + + if (FIELD_OK(extended_host_id)) { + memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id, + sizeof(opts->extended_host_id)); + } + + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS; + } + + if (FIELD_OK(src_addr)) { + memset(opts->src_addr, 0, sizeof(opts->src_addr)); + } + + if (FIELD_OK(src_svcid)) { + memset(opts->src_svcid, 0, sizeof(opts->src_svcid)); + } + + if (FIELD_OK(host_id)) { + memset(opts->host_id, 0, sizeof(opts->host_id)); + } + + if (FIELD_OK(command_set)) { + opts->command_set = SPDK_NVME_CC_CSS_NVM; + } + + if (FIELD_OK(admin_timeout_ms)) { + opts->admin_timeout_ms = NVME_MAX_ADMIN_TIMEOUT_IN_SECS * 1000; + } + + if (FIELD_OK(header_digest)) { + opts->header_digest = false; + } + + if (FIELD_OK(data_digest)) { + opts->data_digest = false; + } + + if (FIELD_OK(disable_error_logging)) { + opts->disable_error_logging = false; + } + + if (FIELD_OK(transport_ack_timeout)) { + opts->transport_ack_timeout = SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT; + } + + if (FIELD_OK(admin_queue_size)) { + opts->admin_queue_size = DEFAULT_ADMIN_QUEUE_SIZE; + } +#undef FIELD_OK +} + +/** + * This function will be called when the process allocates the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq); + qpair->active_proc = active_proc; + } +} + +/** + * This function will be called when the process frees the IO qpair. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_qpair *active_qpair, *tmp_qpair; + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (!active_proc) { + return; + } + + TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs, + per_process_tailq, tmp_qpair) { + if (active_qpair == qpair) { + TAILQ_REMOVE(&active_proc->allocated_io_qpairs, + active_qpair, per_process_tailq); + + break; + } + } +} + +void +spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_io_qpair_opts *opts, + size_t opts_size) +{ + assert(ctrlr); + + assert(opts); + + memset(opts, 0, opts_size); + +#define FIELD_OK(field) \ + offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size + + if (FIELD_OK(qprio)) { + opts->qprio = SPDK_NVME_QPRIO_URGENT; + } + + if (FIELD_OK(io_queue_size)) { + opts->io_queue_size = ctrlr->opts.io_queue_size; + } + + if (FIELD_OK(io_queue_requests)) { + opts->io_queue_requests = ctrlr->opts.io_queue_requests; + } + + if (FIELD_OK(delay_cmd_submit)) { + opts->delay_cmd_submit = false; + } + + if (FIELD_OK(sq.vaddr)) { + opts->sq.vaddr = NULL; + } + + if (FIELD_OK(sq.paddr)) { + opts->sq.paddr = 0; + } + + if (FIELD_OK(sq.buffer_size)) { + opts->sq.buffer_size = 0; + } + + if (FIELD_OK(cq.vaddr)) { + opts->cq.vaddr = NULL; + } + + if (FIELD_OK(cq.paddr)) { + opts->cq.paddr = 0; + } + + if (FIELD_OK(cq.buffer_size)) { + opts->cq.buffer_size = 0; + } + + if (FIELD_OK(create_only)) { + opts->create_only = false; + } + +#undef FIELD_OK +} + +static struct spdk_nvme_qpair * +nvme_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *opts) +{ + uint32_t qid; + struct spdk_nvme_qpair *qpair; + union spdk_nvme_cc_register cc; + + if (!ctrlr) { + return NULL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + if (opts->qprio & ~SPDK_NVME_CREATE_IO_SQ_QPRIO_MASK) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Only value SPDK_NVME_QPRIO_URGENT(0) is valid for the + * default round robin arbitration method. + */ + if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts->qprio != SPDK_NVME_QPRIO_URGENT)) { + SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + /* + * Get the first available I/O queue ID. 
+ */ + qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1); + if (qid > ctrlr->opts.num_io_queues) { + SPDK_ERRLOG("No free I/O queue IDs\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair == NULL) { + SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n"); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return NULL; + } + + spdk_bit_array_clear(ctrlr->free_io_qids, qid); + TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq); + + nvme_ctrlr_proc_add_io_qpair(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return qpair; +} + +int +spdk_nvme_ctrlr_connect_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EISCONN; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) { + spdk_delay_us(100); + } + + return rc; +} + +void +spdk_nvme_ctrlr_disconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +struct spdk_nvme_qpair * +spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_io_qpair_opts *user_opts, + size_t opts_size) +{ + + struct spdk_nvme_qpair *qpair; + struct spdk_nvme_io_qpair_opts opts; + int rc; + + /* + * Get the default options, then overwrite them with the user-provided options + * up to opts_size. + * + * This allows for extensions of the opts structure without breaking + * ABI compatibility. 
+ */ + spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + + /* If user passes buffers, make sure they're big enough for the requested queue size */ + if (opts.sq.vaddr) { + if (opts.sq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))) { + SPDK_ERRLOG("sq buffer size %lx is too small for sq size %lx\n", + opts.sq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cmd))); + return NULL; + } + } + if (opts.cq.vaddr) { + if (opts.cq.buffer_size < (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))) { + SPDK_ERRLOG("cq buffer size %lx is too small for cq size %lx\n", + opts.cq.buffer_size, (opts.io_queue_size * sizeof(struct spdk_nvme_cpl))); + return NULL; + } + } + } + + qpair = nvme_ctrlr_create_io_qpair(ctrlr, &opts); + + if (qpair == NULL || opts.create_only == true) { + return qpair; + } + + rc = spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair); + if (rc != 0) { + SPDK_ERRLOG("nvme_transport_ctrlr_connect_io_qpair() failed\n"); + nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +int +spdk_nvme_ctrlr_reconnect_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + enum nvme_qpair_state qpair_state; + int rc; + + assert(qpair != NULL); + assert(nvme_qpair_is_admin_queue(qpair) == false); + assert(qpair->ctrlr != NULL); + + ctrlr = qpair->ctrlr; + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + qpair_state = nvme_qpair_get_state(qpair); + + if (ctrlr->is_removed) { + rc = -ENODEV; + goto out; + } + + if (ctrlr->is_resetting || qpair_state == NVME_QPAIR_DISCONNECTING) { + rc = -EAGAIN; + goto out; + } + + if (ctrlr->is_failed || qpair_state == NVME_QPAIR_DESTROYING) { + rc = -ENXIO; + goto out; + } + + if (qpair_state != NVME_QPAIR_DISCONNECTED) { + rc = 0; + goto out; + } + + rc = nvme_transport_ctrlr_connect_qpair(ctrlr, qpair); + if (rc) { + rc = -EAGAIN; + goto out; + } + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_ctrlr_get_admin_qp_failure_reason(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->adminq->transport_failure_reason; +} + +/* + * This internal function will attempt to take the controller + * lock before calling disconnect on a controller qpair. + * Functions already holding the controller lock should + * call nvme_transport_ctrlr_disconnect_qpair directly. + */ +void +nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + assert(ctrlr != NULL); + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr; + + if (qpair == NULL) { + return 0; + } + + ctrlr = qpair->ctrlr; + + if (qpair->in_completion_context) { + /* + * There are many cases where it is convenient to delete an io qpair in the context + * of that qpair's completion routine. To handle this properly, set a flag here + * so that the completion routine will perform an actual delete after the context + * unwinds. + */ + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group && qpair->poll_group->in_completion_context) { + /* Same as above, but in a poll group. 
*/ + qpair->poll_group->num_qpairs_to_delete++; + qpair->delete_after_completion_context = 1; + return 0; + } + + if (qpair->poll_group) { + spdk_nvme_poll_group_remove(qpair->poll_group->group, qpair); + } + + /* Do not retry. */ + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + + /* In the multi-process case, a process may call this function on a foreign + * I/O qpair (i.e. one that this process did not create) when that qpairs process + * exits unexpectedly. In that case, we must not try to abort any reqs associated + * with that qpair, since the callbacks will also be foreign to this process. + */ + if (qpair->active_proc == nvme_ctrlr_get_current_process(ctrlr)) { + nvme_qpair_abort_reqs(qpair, 1); + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_proc_remove_io_qpair(qpair); + + TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq); + spdk_bit_array_set(ctrlr->free_io_qids, qpair->id); + + if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -1; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return 0; +} + +static void +nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_intel_log_page_directory *log_page_directory) +{ + if (log_page_directory == NULL) { + return; + } + + if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) { + return; + } + + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true; + + if (log_page_directory->read_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true; + } + if (log_page_directory->write_latency_log_len || + (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true; + } + if (log_page_directory->temperature_statistics_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true; + } + if (log_page_directory->smart_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true; + } + if (log_page_directory->marketing_description_log_len) { + ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true; + } +} + +static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct nvme_completion_poll_status *status; + struct spdk_nvme_intel_log_page_directory *log_page_directory; + + log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + if (log_page_directory == NULL) { + SPDK_ERRLOG("could not allocate log_page_directory\n"); + return -ENXIO; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(log_page_directory); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY, + SPDK_NVME_GLOBAL_NS_TAG, log_page_directory, + sizeof(struct spdk_nvme_intel_log_page_directory), + 0, nvme_completion_poll_cb, status); + if (rc != 0) { + spdk_free(log_page_directory); + free(status); + return rc; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + spdk_free(log_page_directory); + SPDK_WARNLOG("Intel log pages not supported on Intel drive!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory); + spdk_free(log_page_directory); + 
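/*
 * Editor's sketch -- not part of the upstream diff. The function above shows
 * the driver's internal "submit admin command, then poll for the completion"
 * pattern. From application code the same idea can be expressed with the
 * public API roughly as below; the helper names (get_health_log,
 * health_log_done) are hypothetical, and 'page' is assumed to point at
 * DMA-safe memory (e.g. allocated with spdk_zmalloc()).
 */
#include <errno.h>
#include "spdk/nvme.h"

struct health_ctx {
	bool done;
	struct spdk_nvme_cpl cpl;
};

static void
health_log_done(void *arg, const struct spdk_nvme_cpl *cpl)
{
	struct health_ctx *ctx = arg;

	ctx->cpl = *cpl;	/* remember the completion status */
	ctx->done = true;
}

static int
get_health_log(struct spdk_nvme_ctrlr *ctrlr,
	       struct spdk_nvme_health_information_page *page)
{
	struct health_ctx ctx = { .done = false };
	int rc;

	rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION,
					      SPDK_NVME_GLOBAL_NS_TAG, page, sizeof(*page),
					      0, health_log_done, &ctx);
	if (rc != 0) {
		return rc;
	}

	/* Spin on the admin queue until the callback above fires. */
	while (!ctx.done) {
		if (spdk_nvme_ctrlr_process_admin_completions(ctrlr) < 0) {
			return -EIO;
		}
	}

	return spdk_nvme_cpl_is_error(&ctx.cpl) ? -EIO : 0;
}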
free(status); + return 0; +} + +static int +nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + + memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported)); + /* Mandatory pages */ + ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true; + ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true; + if (ctrlr->cdata.lpa.celp) { + ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) { + rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr); + } + + return rc; +} + +static void +nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true; + ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true; +} + +static void +nvme_ctrlr_set_arbitration_feature(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t cdw11; + struct nvme_completion_poll_status *status; + + if (ctrlr->opts.arbitration_burst == 0) { + return; + } + + if (ctrlr->opts.arbitration_burst > 7) { + SPDK_WARNLOG("Valid arbitration burst values is from 0-7\n"); + return; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return; + } + + cdw11 = ctrlr->opts.arbitration_burst; + + if (spdk_nvme_ctrlr_get_flags(ctrlr) & SPDK_NVME_CTRLR_WRR_SUPPORTED) { + cdw11 |= (uint32_t)ctrlr->opts.low_priority_weight << 8; + cdw11 |= (uint32_t)ctrlr->opts.medium_priority_weight << 16; + cdw11 |= (uint32_t)ctrlr->opts.high_priority_weight << 24; + } + + if (spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ARBITRATION, + cdw11, 0, NULL, 0, + nvme_completion_poll_cb, status) < 0) { + SPDK_ERRLOG("Set arbitration feature failed\n"); + free(status); + return; + } + + if (nvme_wait_for_completion_timeout(ctrlr->adminq, status, + ctrlr->opts.admin_timeout_ms / 1000)) { + SPDK_ERRLOG("Timeout to set arbitration feature\n"); + } + + if (!status->timed_out) { + free(status); + } +} + +static void +nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr) +{ + memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported)); + /* Mandatory features */ + ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true; + ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true; + /* Optional features */ + if (ctrlr->cdata.vwc.present) { + ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true; + } + if (ctrlr->cdata.apsta.supported) { + 
ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true; + } + if (ctrlr->cdata.hmpre) { + ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true; + } + if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) { + nvme_ctrlr_set_intel_supported_features(ctrlr); + } + + nvme_ctrlr_set_arbitration_feature(ctrlr); +} + +bool +spdk_nvme_ctrlr_is_failed(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->is_failed; +} + +void +nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove) +{ + /* + * Set the flag here and leave the work failure of qpairs to + * spdk_nvme_qpair_process_completions(). + */ + if (hot_remove) { + ctrlr->is_removed = true; + } + ctrlr->is_failed = true; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr); +} + +/** + * This public API function will try to take the controller lock. + * Any private functions being called from a thread already holding + * the ctrlr lock should call nvme_ctrlr_fail directly. + */ +void +spdk_nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_ctrlr_fail(ctrlr, false); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +static void +nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ms_waited = 0; + uint32_t shutdown_timeout_ms; + + if (ctrlr->is_removed) { + return; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s get_cc() failed\n", ctrlr->trid.traddr); + return; + } + + cc.bits.shn = SPDK_NVME_SHN_NORMAL; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("ctrlr %s set_cc() failed\n", ctrlr->trid.traddr); + return; + } + + /* + * The NVMe specification defines RTD3E to be the time between + * setting SHN = 1 until the controller will set SHST = 10b. + * If the device doesn't report RTD3 entry latency, or if it + * reports RTD3 entry latency less than 10 seconds, pick + * 10 seconds as a reasonable amount of time to + * wait before proceeding. 
+ */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e); + shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000; + shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms); + + do { + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + SPDK_ERRLOG("ctrlr %s get_csts() failed\n", ctrlr->trid.traddr); + return; + } + + if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ctrlr %s shutdown complete in %u milliseconds\n", + ctrlr->trid.traddr, ms_waited); + return; + } + + nvme_delay(1000); + ms_waited++; + } while (ms_waited < shutdown_timeout_ms); + + SPDK_ERRLOG("ctrlr %s did not shutdown within %u milliseconds\n", + ctrlr->trid.traddr, shutdown_timeout_ms); + if (ctrlr->quirks & NVME_QUIRK_SHST_COMPLETE) { + SPDK_ERRLOG("likely due to shutdown handling in the VMWare emulated NVMe SSD\n"); + } +} + +static int +nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + int rc; + + rc = nvme_transport_ctrlr_enable(ctrlr); + if (rc != 0) { + SPDK_ERRLOG("transport ctrlr_enable failed\n"); + return rc; + } + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en != 0) { + SPDK_ERRLOG("called with CC.EN = 1\n"); + return -EINVAL; + } + + cc.bits.en = 1; + cc.bits.css = 0; + cc.bits.shn = 0; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + + /* Page size is 2 ^ (12 + mps). */ + cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12; + + if (ctrlr->cap.bits.css == 0) { + SPDK_INFOLOG(SPDK_LOG_NVME, + "Drive reports no command sets supported. Assuming NVM is supported.\n"); + ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM; + } + + if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n", + ctrlr->opts.command_set, ctrlr->cap.bits.css); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Falling back to NVM. 
Assuming NVM is supported.\n"); + ctrlr->opts.command_set = SPDK_NVME_CC_CSS_NVM; + } + + cc.bits.css = ctrlr->opts.command_set; + + switch (ctrlr->opts.arb_mechanism) { + case SPDK_NVME_CC_AMS_RR: + break; + case SPDK_NVME_CC_AMS_WRR: + if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + case SPDK_NVME_CC_AMS_VS: + if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) { + break; + } + return -EINVAL; + default: + return -EINVAL; + } + + cc.bits.ams = ctrlr->opts.arb_mechanism; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_ctrlr_disable(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + + if (nvme_ctrlr_get_cc(ctrlr, &cc)) { + SPDK_ERRLOG("get_cc() failed\n"); + return -EIO; + } + + if (cc.bits.en == 0) { + return 0; + } + + cc.bits.en = 0; + + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + + return 0; +} + +#ifdef DEBUG +static const char * +nvme_ctrlr_state_string(enum nvme_ctrlr_state state) +{ + switch (state) { + case NVME_CTRLR_STATE_INIT_DELAY: + return "delay init"; + case NVME_CTRLR_STATE_INIT: + return "init"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + return "disable and wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + return "disable and wait for CSTS.RDY = 0"; + case NVME_CTRLR_STATE_ENABLE: + return "enable controller by writing CC.EN = 1"; + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + return "wait for CSTS.RDY = 1"; + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + return "reset admin queue"; + case NVME_CTRLR_STATE_IDENTIFY: + return "identify controller"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + return "wait for identify controller"; + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + return "set number of queues"; + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + return "wait for set number of queues"; + case NVME_CTRLR_STATE_CONSTRUCT_NS: + return "construct namespaces"; + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + return "identify active ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + return "wait for identify active ns"; + case NVME_CTRLR_STATE_IDENTIFY_NS: + return "identify ns"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + return "wait for identify ns"; + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + return "identify namespace id descriptors"; + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + return "wait for identify namespace id descriptors"; + case NVME_CTRLR_STATE_CONFIGURE_AER: + return "configure AER"; + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + return "wait for configure aer"; + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + return "set supported log pages"; + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + return "set supported features"; + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + return "set doorbell buffer config"; + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + return "wait for doorbell buffer config"; + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + return "set keep alive timeout"; + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + return "wait for set keep alive timeout"; + case NVME_CTRLR_STATE_SET_HOST_ID: + return "set host ID"; + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + return "wait for set host ID"; + case NVME_CTRLR_STATE_READY: + return "ready"; + case NVME_CTRLR_STATE_ERROR: + return "error"; + } + return "unknown"; +}; +#endif /* DEBUG */ + +static void +nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum 
nvme_ctrlr_state state, + uint64_t timeout_in_ms) +{ + uint64_t ticks_per_ms, timeout_in_ticks, now_ticks; + + ctrlr->state = state; + if (timeout_in_ms == NVME_TIMEOUT_INFINITE) { + goto inf; + } + + ticks_per_ms = spdk_get_ticks_hz() / 1000; + if (timeout_in_ms > UINT64_MAX / ticks_per_ms) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + now_ticks = spdk_get_ticks(); + timeout_in_ticks = timeout_in_ms * ticks_per_ms; + if (timeout_in_ticks > UINT64_MAX - now_ticks) { + SPDK_ERRLOG("Specified timeout would cause integer overflow. Defaulting to no timeout.\n"); + goto inf; + } + + ctrlr->state_timeout_tsc = timeout_in_ticks + now_ticks; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n", + nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms); + return; +inf: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n", + nvme_ctrlr_state_string(ctrlr->state)); + ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE; +} + +static void +nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->shadow_doorbell) { + spdk_free(ctrlr->shadow_doorbell); + ctrlr->shadow_doorbell = NULL; + } + + if (ctrlr->eventidx) { + spdk_free(ctrlr->eventidx); + ctrlr->eventidx = NULL; + } +} + +static void +nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_WARNLOG("Doorbell buffer config failed\n"); + } else { + SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n", + ctrlr->trid.traddr); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint64_t prp1, prp2, len; + + if (!ctrlr->cdata.oacs.doorbell_buffer_config) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + /* only 1 page size for doorbell buffer */ + ctrlr->shadow_doorbell = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->shadow_doorbell == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp1 = spdk_vtophys(ctrlr->shadow_doorbell, &len); + if (prp1 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + ctrlr->eventidx = spdk_zmalloc(ctrlr->page_size, ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, + SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (ctrlr->eventidx == NULL) { + rc = -ENOMEM; + goto error; + } + + len = ctrlr->page_size; + prp2 = spdk_vtophys(ctrlr->eventidx, &len); + if (prp2 == SPDK_VTOPHYS_ERROR || len != ctrlr->page_size) { + rc = -EFAULT; + goto error; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2, + nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr); + if (rc != 0) { + goto error; + } + + return 0; + +error: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + nvme_ctrlr_free_doorbell_buffer(ctrlr); + return rc; +} + +static void 
+nvme_ctrlr_abort_queued_aborts(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *req, *tmp; + struct spdk_nvme_cpl cpl = {}; + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) { + STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, &cpl); + nvme_free_request(req); + } +} + +int +spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + struct spdk_nvme_qpair *qpair; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_resetting || ctrlr->is_removed) { + /* + * Controller is already resetting or has been removed. Return + * immediately since there is no need to kick off another + * reset in these cases. + */ + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return ctrlr->is_resetting ? 0 : -ENXIO; + } + + ctrlr->is_resetting = true; + ctrlr->is_failed = false; + + SPDK_NOTICELOG("resetting controller\n"); + + /* Abort all of the queued abort requests */ + nvme_ctrlr_abort_queued_aborts(ctrlr); + + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + /* Disable all queues before disabling the controller hardware. */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + } + + ctrlr->adminq->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, ctrlr->adminq); + if (nvme_transport_ctrlr_connect_qpair(ctrlr, ctrlr->adminq) != 0) { + SPDK_ERRLOG("Controller reinitialization failed.\n"); + rc = -1; + goto out; + } + + /* Doorbell buffer config is invalid during reset */ + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + /* Set the state back to INIT to cause a full hardware reset. */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + + nvme_qpair_set_state(ctrlr->adminq, NVME_QPAIR_ENABLED); + while (ctrlr->state != NVME_CTRLR_STATE_READY) { + if (nvme_ctrlr_process_init(ctrlr) != 0) { + SPDK_ERRLOG("controller reinitialization failed\n"); + rc = -1; + break; + } + } + + /* + * For PCIe controllers, the memory locations of the tranpsort qpair + * don't change when the controller is reset. They simply need to be + * re-enabled with admin commands to the controller. For fabric + * controllers we need to disconnect and reconnect the qpair on its + * own thread outside of the context of the reset. + */ + if (rc == 0 && ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + /* Reinitialize qpairs */ + TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) { + if (nvme_transport_ctrlr_connect_qpair(ctrlr, qpair) != 0) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + rc = -1; + continue; + } + } + } + +out: + if (rc) { + nvme_ctrlr_fail(ctrlr, false); + } + ctrlr->is_resetting = false; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (!ctrlr->cdata.oaes.ns_attribute_notices) { + /* + * If controller doesn't support ns_attribute_notices and + * namespace attributes change (e.g. number of namespaces) + * we need to update system handling device reset. 
+ */ + nvme_io_msg_ctrlr_update(ctrlr); + } + + return rc; +} + +int +spdk_nvme_ctrlr_set_trid(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_transport_id *trid) +{ + int rc = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->is_failed == false) { + rc = -EPERM; + goto out; + } + + if (trid->trtype != ctrlr->trid.trtype) { + rc = -EINVAL; + goto out; + } + + if (strncmp(trid->subnqn, ctrlr->trid.subnqn, SPDK_NVMF_NQN_MAX_LEN)) { + rc = -EINVAL; + goto out; + } + + ctrlr->trid = *trid; + +out: + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* + * Use MDTS to ensure our default max_xfer_size doesn't exceed what the + * controller supports. + */ + ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size); + if (ctrlr->cdata.mdts > 0) { + ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size, + ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts))); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid); + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + ctrlr->cntlid = ctrlr->cdata.cntlid; + } else { + /* + * Fabrics controllers should already have CNTLID from the Connect command. + * + * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data, + * trust the one from Connect. + */ + if (ctrlr->cntlid != ctrlr->cdata.cntlid) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n", + ctrlr->cdata.cntlid, ctrlr->cntlid); + } + } + + if (ctrlr->cdata.sgls.supported) { + assert(ctrlr->cdata.sgls.supported != 0x3); + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED; + if (ctrlr->cdata.sgls.supported == 0x2) { + ctrlr->flags |= SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT; + } + /* + * Use MSDBD to ensure our max_sges doesn't exceed what the + * controller supports. + */ + ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr); + if (ctrlr->cdata.nvmf_specific.msdbd != 0) { + ctrlr->max_sges = spdk_min(ctrlr->cdata.nvmf_specific.msdbd, ctrlr->max_sges); + } else { + /* A value 0 indicates no limit. 
*/ + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_sges %u\n", ctrlr->max_sges); + } + + if (ctrlr->cdata.oacs.security && !(ctrlr->quirks & NVME_QUIRK_OACS_SECURITY)) { + ctrlr->flags |= SPDK_NVME_CTRLR_SECURITY_SEND_RECV_SUPPORTED; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "fuses compare and write: %d\n", ctrlr->cdata.fuses.compare_and_write); + if (ctrlr->cdata.fuses.compare_and_write) { + ctrlr->flags |= SPDK_NVME_CTRLR_COMPARE_AND_WRITE_SUPPORTED; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &ctrlr->cdata, sizeof(ctrlr->cdata), + nvme_ctrlr_identify_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +enum nvme_active_ns_state { + NVME_ACTIVE_NS_STATE_IDLE, + NVME_ACTIVE_NS_STATE_PROCESSING, + NVME_ACTIVE_NS_STATE_DONE, + NVME_ACTIVE_NS_STATE_ERROR +}; + +typedef void (*nvme_active_ns_ctx_deleter)(struct nvme_active_ns_ctx *); + +struct nvme_active_ns_ctx { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t page; + uint32_t num_pages; + uint32_t next_nsid; + uint32_t *new_ns_list; + nvme_active_ns_ctx_deleter deleter; + + enum nvme_active_ns_state state; +}; + +static struct nvme_active_ns_ctx * +nvme_active_ns_ctx_create(struct spdk_nvme_ctrlr *ctrlr, nvme_active_ns_ctx_deleter deleter) +{ + struct nvme_active_ns_ctx *ctx; + uint32_t num_pages = 0; + uint32_t *new_ns_list = NULL; + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + SPDK_ERRLOG("Failed to allocate nvme_active_ns_ctx!\n"); + return NULL; + } + + if (ctrlr->num_ns) { + /* The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list) */ + num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1; + new_ns_list = spdk_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size, + NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA | SPDK_MALLOC_SHARE); + if (!new_ns_list) { + SPDK_ERRLOG("Failed to allocate active_ns_list!\n"); + free(ctx); + return NULL; + } + } + + ctx->num_pages = num_pages; + ctx->new_ns_list = new_ns_list; + ctx->ctrlr = ctrlr; + ctx->deleter = deleter; + + return ctx; +} + +static void +nvme_active_ns_ctx_destroy(struct nvme_active_ns_ctx *ctx) +{ + spdk_free(ctx->new_ns_list); + free(ctx); +} + +static void +nvme_ctrlr_identify_active_ns_swap(struct spdk_nvme_ctrlr *ctrlr, uint32_t **new_ns_list) +{ + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = *new_ns_list; + *new_ns_list = NULL; +} + +static void +nvme_ctrlr_identify_active_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_active_ns_ctx *ctx = arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + ctx->next_nsid = ctx->new_ns_list[1024 * ctx->page + 1023]; + if (ctx->next_nsid == 0 || ++ctx->page == ctx->num_pages) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +nvme_ctrlr_identify_active_ns_async(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + uint32_t i; + int rc; + + if (ctrlr->num_ns == 0) { + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + 
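/*
 * Editor's sketch -- not part of the upstream diff. This function pages
 * through the Identify Active Namespace ID list (CNS 0x02) 1024 NSIDs at a
 * time, chaining each request off the last NSID of the previous page. An
 * application never sees that pagination; once attach completes it walks the
 * cached list through the public accessors, roughly as below
 * (print_active_namespaces is a hypothetical helper).
 */
#include <inttypes.h>
#include <stdio.h>
#include "spdk/nvme.h"

static void
print_active_namespaces(struct spdk_nvme_ctrlr *ctrlr)
{
	uint32_t nsid;

	for (nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
	     nsid != 0;
	     nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, nsid)) {
		struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);

		if (ns == NULL) {
			continue;
		}
		printf("nsid %" PRIu32 ": %" PRIu64 " bytes\n",
		       nsid, spdk_nvme_ns_get_size(ns));
	}
}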
goto out; + } + + /* + * If controller doesn't support active ns list CNS 0x02 dummy up + * an active ns list, i.e. all namespaces report as active + */ + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 1, 0) || ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS) { + for (i = 0; i < ctrlr->num_ns; i++) { + ctx->new_ns_list[i] = i + 1; + } + + ctx->state = NVME_ACTIVE_NS_STATE_DONE; + goto out; + } + + ctx->state = NVME_ACTIVE_NS_STATE_PROCESSING; + rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, ctx->next_nsid, + &ctx->new_ns_list[1024 * ctx->page], sizeof(struct spdk_nvme_ns_list), + nvme_ctrlr_identify_active_ns_async_done, ctx); + if (rc != 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + goto out; + } + + return; + +out: + if (ctx->deleter) { + ctx->deleter(ctx); + } +} + +static void +_nvme_active_ns_ctx_deleter(struct nvme_active_ns_ctx *ctx) +{ + struct spdk_nvme_ctrlr *ctrlr = ctx->ctrlr; + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_ctrlr_destruct_namespaces(ctrlr); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, ctrlr->opts.admin_timeout_ms); +} + +static void +_nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + + ctx = nvme_active_ns_ctx_create(ctrlr, _nvme_active_ns_ctx_deleter); + if (!ctx) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + nvme_ctrlr_identify_active_ns_async(ctx); +} + +int +nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_active_ns_ctx *ctx; + int rc; + + ctx = nvme_active_ns_ctx_create(ctrlr, NULL); + if (!ctx) { + return -ENOMEM; + } + + nvme_ctrlr_identify_active_ns_async(ctx); + while (ctx->state == NVME_ACTIVE_NS_STATE_PROCESSING) { + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + if (rc < 0) { + ctx->state = NVME_ACTIVE_NS_STATE_ERROR; + break; + } + } + + if (ctx->state == NVME_ACTIVE_NS_STATE_ERROR) { + nvme_active_ns_ctx_destroy(ctx); + return -ENXIO; + } + + assert(ctx->state == NVME_ACTIVE_NS_STATE_DONE); + nvme_ctrlr_identify_active_ns_swap(ctrlr, &ctx->new_ns_list); + nvme_active_ns_ctx_destroy(ctx); + + return 0; +} + +static void +nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } else { + nvme_ns_set_identify_data(ns); + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return; + } + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + struct 
spdk_nvme_ns_data *nsdata; + + nsdata = &ctrlr->nsdata[ns->id - 1]; + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_ctrlr_identify_ns_async_done, ns); +} + +static int +nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + ns->ctrlr = ctrlr; + ns->id = nsid; + + rc = nvme_ctrlr_identify_ns_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg; + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + uint32_t nsid; + int rc; + + if (spdk_nvme_cpl_is_error(cpl)) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* move on to the next active NS */ + nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } +} + +static int +nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + ctrlr->opts.admin_timeout_ms); + return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, + 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_ctrlr_identify_id_desc_async_done, ns); +} + +static int +nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t nsid; + struct spdk_nvme_ns *ns; + int rc; + + if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr); + ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + if (ns == NULL) { + /* No active NS, move on to the next state */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + rc = nvme_ctrlr_identify_id_desc_async(ns); + if (rc) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + } + + return rc; +} + +static void +nvme_ctrlr_update_nvmf_ioccsz(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP || + ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_FC) { + if (ctrlr->cdata.nvmf_specific.ioccsz < 4) { + SPDK_ERRLOG("Incorrect IOCCSZ %u, the minimum value should be 4\n", + ctrlr->cdata.nvmf_specific.ioccsz); + ctrlr->cdata.nvmf_specific.ioccsz = 4; + assert(0); + } + 
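/*
 * Editor's note -- worked example, not part of the upstream diff. IOCCSZ is
 * reported in 16-byte units and covers the entire I/O command capsule,
 * including the 64-byte submission queue entry, which is why the line below
 * subtracts sizeof(struct spdk_nvme_cmd). With the spec minimum IOCCSZ = 4
 * the capsule is 4 * 16 = 64 bytes, leaving 0 bytes of in-capsule data; a
 * target advertising IOCCSZ = 260 allows 260 * 16 = 4160 bytes, i.e. 4096
 * bytes of in-capsule data after the command.
 */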
ctrlr->ioccsz_bytes = ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd); + ctrlr->icdoff = ctrlr->cdata.nvmf_specific.icdoff; + } +} + +static void +nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t cq_allocated, sq_allocated, min_allocated, i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_ERRLOG("Set Features - Number of Queues failed!\n"); + ctrlr->opts.num_io_queues = 0; + } else { + /* + * Data in cdw0 is 0-based. + * Lower 16-bits indicate number of submission queues allocated. + * Upper 16-bits indicate number of completion queues allocated. + */ + sq_allocated = (cpl->cdw0 & 0xFFFF) + 1; + cq_allocated = (cpl->cdw0 >> 16) + 1; + + /* + * For 1:1 queue mapping, set number of allocated queues to be minimum of + * submission and completion queues. + */ + min_allocated = spdk_min(sq_allocated, cq_allocated); + + /* Set number of queues to be minimum of requested and actually allocated. */ + ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues); + } + + ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1); + if (ctrlr->free_io_qids == NULL) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + + /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */ + spdk_bit_array_clear(ctrlr->free_io_qids, 0); + for (i = 1; i <= ctrlr->opts.num_io_queues; i++) { + spdk_bit_array_set(ctrlr->free_io_qids, i); + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) { + SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n", + ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES); + ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES; + } else if (ctrlr->opts.num_io_queues < 1) { + SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n"); + ctrlr->opts.num_io_queues = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues, + nvme_ctrlr_set_num_queues_done, ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + uint32_t keep_alive_interval_ms; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + if ((cpl->status.sct == SPDK_NVME_SCT_GENERIC) && + (cpl->status.sc == SPDK_NVME_SC_INVALID_FIELD)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Keep alive timeout Get Feature is not supported\n"); + } else { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n", + cpl->status.sc, cpl->status.sct); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } else { + if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n", + cpl->cdw0); + } + + ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0; + } + + keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2; + if (keep_alive_interval_ms == 0) { + keep_alive_interval_ms = 1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, 
"Sending keep alive every %u ms\n", keep_alive_interval_ms); + + ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000); + + /* Schedule the first Keep Alive to be sent as soon as possible. */ + ctrlr->next_keep_alive_tick = spdk_get_ticks(); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->opts.keep_alive_timeout_ms == 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + if (ctrlr->cdata.kas == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n"); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, + ctrlr->opts.admin_timeout_ms); + return 0; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + ctrlr->opts.admin_timeout_ms); + + /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */ + rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0, + nvme_ctrlr_set_keep_alive_timeout_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc); + ctrlr->opts.keep_alive_timeout_ms = 0; + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + /* + * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature + * is optional. + */ + SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n", + cpl->status.sc, cpl->status.sct); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); +} + +static int +nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr) +{ + uint8_t *host_id; + uint32_t host_id_size; + int rc; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + /* + * NVMe-oF sends the host ID during Connect and doesn't allow + * Set Features - Host Identifier after Connect, so we don't need to do anything here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + if (ctrlr->cdata.ctratt.host_id_exhid_supported) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n"); + host_id = ctrlr->opts.extended_host_id; + host_id_size = sizeof(ctrlr->opts.extended_host_id); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n"); + host_id = ctrlr->opts.host_id; + host_id_size = sizeof(ctrlr->opts.host_id); + } + + /* If the user specified an all-zeroes host identifier, don't send the command. 
*/ + if (spdk_mem_all_zero(host_id, host_id_size)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, + "User did not specify host ID - not sending Set Features - Host ID\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE); + return 0; + } + + SPDK_LOGDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size); + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr); + if (rc != 0) { + SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +static void +nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->ns) { + uint32_t i, num_ns = ctrlr->num_ns; + + for (i = 0; i < num_ns; i++) { + nvme_ns_destruct(&ctrlr->ns[i]); + } + + spdk_free(ctrlr->ns); + ctrlr->ns = NULL; + ctrlr->num_ns = 0; + } + + if (ctrlr->nsdata) { + spdk_free(ctrlr->nsdata); + ctrlr->nsdata = NULL; + } + + spdk_free(ctrlr->active_ns_list); + ctrlr->active_ns_list = NULL; +} + +static void +nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + uint32_t i, nn = ctrlr->cdata.nn; + struct spdk_nvme_ns_data *nsdata; + bool ns_is_active; + + for (i = 0; i < nn; i++) { + struct spdk_nvme_ns *ns = &ctrlr->ns[i]; + uint32_t nsid = i + 1; + + nsdata = &ctrlr->nsdata[nsid - 1]; + ns_is_active = spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid); + + if (nsdata->ncap && ns_is_active) { + if (nvme_ns_update(ns) != 0) { + SPDK_ERRLOG("Failed to update active NS %u\n", nsid); + continue; + } + } + + if ((nsdata->ncap == 0) && ns_is_active) { + if (nvme_ns_construct(ns, nsid, ctrlr) != 0) { + continue; + } + } + + if (nsdata->ncap && !ns_is_active) { + nvme_ns_destruct(ns); + } + } +} + +static int +nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc = 0; + uint32_t nn = ctrlr->cdata.nn; + + /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset), + * so check if we need to reallocate. + */ + if (nn != ctrlr->num_ns) { + nvme_ctrlr_destruct_namespaces(ctrlr); + + if (nn == 0) { + SPDK_WARNLOG("controller has 0 namespaces\n"); + return 0; + } + + ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr->ns == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64, + NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA); + if (ctrlr->nsdata == NULL) { + rc = -ENOMEM; + goto fail; + } + + ctrlr->num_ns = nn; + } + + return 0; + +fail: + nvme_ctrlr_destruct_namespaces(ctrlr); + return rc; +} + +static void +nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer = arg; + struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + union spdk_nvme_async_event_completion event; + int rc; + + if (cpl->status.sct == SPDK_NVME_SCT_GENERIC && + cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) { + /* + * This is simulated when controller is being shut down, to + * effectively abort outstanding asynchronous event requests + * and make sure all memory is freed. Do not repost the + * request in this case. 
+ */ + return; + } + + if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) { + /* + * SPDK will only send as many AERs as the device says it supports, + * so this status code indicates an out-of-spec device. Do not repost + * the request in this case. + */ + SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request\n" + "handling. Do not repost this AER.\n"); + return; + } + + event.raw = cpl->cdw0; + if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) && + (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) { + rc = nvme_ctrlr_identify_active_ns(ctrlr); + if (rc) { + return; + } + nvme_ctrlr_update_namespaces(ctrlr); + nvme_io_msg_ctrlr_update(ctrlr); + } + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc && active_proc->aer_cb_fn) { + active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl); + } + + /* If the ctrlr was removed or in the destruct state, we should not send aer again */ + if (ctrlr->is_removed || ctrlr->is_destructed) { + return; + } + + /* + * Repost another asynchronous event request to replace the one + * that just completed. + */ + if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) { + /* + * We can't do anything to recover from a failure here, + * so just print a warning message and leave the AER unsubmitted. + */ + SPDK_ERRLOG("resubmitting AER failed!\n"); + } +} + +static int +nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_async_event_request *aer) +{ + struct nvme_request *req; + + aer->ctrlr = ctrlr; + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer); + aer->req = req; + if (req == NULL) { + return -1; + } + + req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST; + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static void +nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_async_event_request *aer; + int rc; + uint32_t i; + struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg; + + if (spdk_nvme_cpl_is_error(cpl)) { + SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); + return; + } + + /* aerl is a zero-based value, so we need to add 1 here. 
*/ + ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1)); + + for (i = 0; i < ctrlr->num_aers; i++) { + aer = &ctrlr->aer[i]; + rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); + if (rc) { + SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return; + } + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + ctrlr->opts.admin_timeout_ms); +} + +static int +nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_feat_async_event_configuration config; + int rc; + + config.raw = 0; + config.bits.crit_warn.bits.available_spare = 1; + config.bits.crit_warn.bits.temperature = 1; + config.bits.crit_warn.bits.device_reliability = 1; + config.bits.crit_warn.bits.read_only = 1; + config.bits.crit_warn.bits.volatile_memory_backup = 1; + + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) { + if (ctrlr->cdata.oaes.ns_attribute_notices) { + config.bits.ns_attr_notice = 1; + } + if (ctrlr->cdata.oaes.fw_activation_notices) { + config.bits.fw_activation_notice = 1; + } + } + if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) { + config.bits.telemetry_log_notice = 1; + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + ctrlr->opts.admin_timeout_ms); + + rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config, + nvme_ctrlr_configure_aer_done, + ctrlr); + if (rc != 0) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE); + return rc; + } + + return 0; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + if (active_proc->pid == pid) { + return active_proc; + } + } + + return NULL; +} + +struct spdk_nvme_ctrlr_process * +nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr) +{ + return nvme_ctrlr_get_process(ctrlr, getpid()); +} + +/** + * This function will be called when a process is using the controller. + * 1. For the primary process, it is called when constructing the controller. + * 2. For the secondary process, it is called at probing the controller. + * Note: will check whether the process is already added for the same process. + */ +int +nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle) +{ + struct spdk_nvme_ctrlr_process *ctrlr_proc; + pid_t pid = getpid(); + + /* Check whether the process is already added or not */ + if (nvme_ctrlr_get_process(ctrlr, pid)) { + return 0; + } + + /* Initialize the per process properties for this ctrlr */ + ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process), + 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (ctrlr_proc == NULL) { + SPDK_ERRLOG("failed to allocate memory to track the process props\n"); + + return -1; + } + + ctrlr_proc->is_primary = spdk_process_is_primary(); + ctrlr_proc->pid = pid; + STAILQ_INIT(&ctrlr_proc->active_reqs); + ctrlr_proc->devhandle = devhandle; + ctrlr_proc->ref = 0; + TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs); + + TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq); + + return 0; +} + +/** + * This function will be called when the process detaches the controller. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static void +nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_ctrlr_process *proc) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + assert(STAILQ_EMPTY(&proc->active_reqs)); + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq); + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + spdk_pci_device_detach(proc->devhandle); + } + + spdk_free(proc); +} + +/** + * This function will be called when the process exited unexpectedly + * in order to free any incomplete nvme request, allocated IO qpairs + * and allocated memory. + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc) +{ + struct nvme_request *req, *tmp_req; + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == proc->pid); + + nvme_free_request(req); + } + + TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) { + TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq); + + /* + * The process may have been killed while some qpairs were in their + * completion context. Clear that flag here to allow these IO + * qpairs to be deleted. + */ + qpair->in_completion_context = 0; + + qpair->no_deletion_notification_needed = 1; + + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + spdk_free(proc); +} + +/** + * This function will be called when destructing the controller. + * 1. There is no more admin request on this controller. + * 2. Clean up any left resource allocation when its associated process is gone. + */ +void +nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + + /* Free all the processes' properties and make sure no pending admin IOs */ + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + assert(STAILQ_EMPTY(&active_proc->active_reqs)); + + spdk_free(active_proc); + } +} + +/** + * This function will be called when any other process attaches or + * detaches the controller in order to cleanup those unexpectedly + * terminated processes. + * Note: the ctrlr_lock must be held when calling this function. 
+ */ +static int +nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc, *tmp; + int active_proc_count = 0; + + TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) { + if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) { + SPDK_ERRLOG("process %d terminated unexpected\n", active_proc->pid); + + TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq); + + nvme_ctrlr_cleanup_process(active_proc); + } else { + active_proc_count++; + } + } + + return active_proc_count; +} + +void +nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref++; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int proc_count; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->ref--; + assert(active_proc->ref >= 0); + + /* + * The last active process will be removed at the end of + * the destruction of the controller. + */ + if (active_proc->ref == 0 && proc_count != 1) { + nvme_ctrlr_remove_process(ctrlr, active_proc); + } + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +int +nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + int ref = 0; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + nvme_ctrlr_remove_inactive_proc(ctrlr); + + TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) { + ref += active_proc->ref; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return ref; +} + +/** + * Get the PCI device handle which is only visible to its associated process. + */ +struct spdk_pci_device * +nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_ctrlr_process *active_proc; + struct spdk_pci_device *devhandle = NULL; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + devhandle = active_proc->devhandle; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return devhandle; +} + +/** + * This function will be called repeatedly during initialization until the controller is ready. + */ +int +nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + uint32_t ready_timeout_in_ms; + int rc = 0; + + /* + * May need to avoid accessing any register on the target controller + * for a while. Return early without touching the FSM. + * Check sleep_timeout_tsc > 0 for unit test. + */ + if ((ctrlr->sleep_timeout_tsc > 0) && + (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) { + return 0; + } + ctrlr->sleep_timeout_tsc = 0; + + if (nvme_ctrlr_get_cc(ctrlr, &cc) || + nvme_ctrlr_get_csts(ctrlr, &csts)) { + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) { + /* While a device is resetting, it may be unable to service MMIO reads + * temporarily. Allow for this case. 
+ */ + SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n"); + goto init_timeout; + } + SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state); + return -EIO; + } + + ready_timeout_in_ms = 500 * ctrlr->cap.bits.to; + + /* + * Check if the current initialization step is done or has timed out. + */ + switch (ctrlr->state) { + case NVME_CTRLR_STATE_INIT_DELAY: + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms); + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_INIT) { + /* + * Controller may need some delay before it's enabled. + * + * This is a workaround for an issue where the PCIe-attached NVMe controller + * is not ready after VFIO reset. We delay the initialization rather than the + * enabling itself, because this is required only for the very first enabling + * - directly after a VFIO reset. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000); + } + break; + + case NVME_CTRLR_STATE_INIT: + /* Begin the hardware initialization by making sure the controller is disabled. */ + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n"); + /* + * Controller is currently enabled. We need to disable it to cause a reset. + * + * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready. + * Wait for the ready bit to be 1 before disabling the controller. + */ + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return 0; + } + + /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + + /* + * Wait 2.5 seconds before accessing PCI registers. + * Not using sleep() to avoid blocking other controller's initialization. + */ + if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n"); + ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000); + } + return 0; + } else { + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n"); + } + + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n"); + /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n"); + cc.bits.en = 0; + if (nvme_ctrlr_set_cc(ctrlr, &cc)) { + SPDK_ERRLOG("set_cc() failed\n"); + return -EIO; + } + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0: + if (csts.bits.rdy == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n"); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms); + /* + * Delay 100us before setting CC.EN = 1. 
Some NVMe SSDs miss CC.EN getting + * set to 1 if it is too soon after CSTS.RDY is reported as 0. + */ + spdk_delay_us(100); + return 0; + } + break; + + case NVME_CTRLR_STATE_ENABLE: + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n"); + rc = nvme_ctrlr_enable(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms); + return rc; + + case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1: + if (csts.bits.rdy == 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n"); + /* + * The controller has been enabled. + * Perform the rest of initialization serially. + */ + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + ctrlr->opts.admin_timeout_ms); + return 0; + } + break; + + case NVME_CTRLR_STATE_RESET_ADMIN_QUEUE: + nvme_transport_qpair_reset(ctrlr->adminq); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY: + rc = nvme_ctrlr_identify(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_NUM_QUEUES: + nvme_ctrlr_update_nvmf_ioccsz(ctrlr); + rc = nvme_ctrlr_set_num_queues(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONSTRUCT_NS: + rc = nvme_ctrlr_construct_namespaces(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS: + _nvme_ctrlr_identify_active_ns(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_NS: + rc = nvme_ctrlr_identify_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS: + rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_CONFIGURE_AER: + rc = nvme_ctrlr_configure_aer(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES: + rc = nvme_ctrlr_set_supported_log_pages(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES: + nvme_ctrlr_set_supported_features(ctrlr); + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG, + ctrlr->opts.admin_timeout_ms); + break; + + case NVME_CTRLR_STATE_SET_DB_BUF_CFG: + rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT: + rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_SET_HOST_ID: + rc = nvme_ctrlr_set_host_id(ctrlr); + break; + + case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID: + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + break; + + case NVME_CTRLR_STATE_READY: + 
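+		/*
+		 * Terminal state of the init state machine. The attach path is
+		 * expected to call nvme_ctrlr_process_init() repeatedly until this
+		 * state is reached, conceptually (illustrative sketch only):
+		 *
+		 *   do {
+		 *       rc = nvme_ctrlr_process_init(ctrlr);
+		 *   } while (rc == 0 && ctrlr->state != NVME_CTRLR_STATE_READY);
+		 */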
SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n"); + return 0; + + case NVME_CTRLR_STATE_ERROR: + SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr); + return -1; + + default: + assert(0); + return -1; + } + +init_timeout: + if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE && + spdk_get_ticks() > ctrlr->state_timeout_tsc) { + SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state); + return -1; + } + + return rc; +} + +int +nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx) +{ + pthread_mutexattr_t attr; + int rc = 0; + + if (pthread_mutexattr_init(&attr)) { + return -1; + } + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) || +#ifndef __FreeBSD__ + pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) || + pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) || +#endif + pthread_mutex_init(mtx, &attr)) { + rc = -1; + } + pthread_mutexattr_destroy(&attr); + return rc; +} + +int +nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE); + } else { + nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE); + } + + if (ctrlr->opts.admin_queue_size > SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u exceeds max defined by NVMe spec, use max value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MAX_ENTRIES; + } + + if (ctrlr->opts.admin_queue_size < SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES) { + SPDK_ERRLOG("admin_queue_size %u is less than minimum defined by NVMe spec, use min value\n", + ctrlr->opts.admin_queue_size); + ctrlr->opts.admin_queue_size = SPDK_NVME_ADMIN_QUEUE_MIN_ENTRIES; + } + + ctrlr->flags = 0; + ctrlr->free_io_qids = NULL; + ctrlr->is_resetting = false; + ctrlr->is_failed = false; + ctrlr->is_destructed = false; + + TAILQ_INIT(&ctrlr->active_io_qpairs); + STAILQ_INIT(&ctrlr->queued_aborts); + ctrlr->outstanding_aborts = 0; + + rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock); + if (rc != 0) { + return rc; + } + + TAILQ_INIT(&ctrlr->active_procs); + + return rc; +} + +/* This function should be called once at ctrlr initialization to set up constant properties. */ +void +nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs) +{ + ctrlr->cap = *cap; + ctrlr->vs = *vs; + + if (ctrlr->cap.bits.ams & SPDK_NVME_CAP_AMS_WRR) { + ctrlr->flags |= SPDK_NVME_CTRLR_WRR_SUPPORTED; + } + + ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin); + + /* For now, always select page_size == min_page_size. 
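+	 * min_page_size is 2^(12 + CAP.MPSMIN); e.g. MPSMIN = 0 yields a
+	 * 4 KiB page size and MPSMIN = 1 yields 8 KiB.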
*/ + ctrlr->page_size = ctrlr->min_page_size; + + ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES); + ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u); + + ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size); +} + +void +nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr) +{ + pthread_mutex_destroy(&ctrlr->ctrlr_lock); +} + +void +nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_nvme_qpair *qpair, *tmp; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr); + + ctrlr->is_destructed = true; + + spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + + nvme_ctrlr_abort_queued_aborts(ctrlr); + nvme_transport_admin_qpair_abort_aers(ctrlr->adminq); + + TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + } + + nvme_ctrlr_free_doorbell_buffer(ctrlr); + + if (ctrlr->opts.no_shn_notification) { + SPDK_INFOLOG(SPDK_LOG_NVME, "Disable SSD: %s without shutdown notification\n", + ctrlr->trid.traddr); + nvme_ctrlr_disable(ctrlr); + } else { + nvme_ctrlr_shutdown(ctrlr); + } + + nvme_ctrlr_destruct_namespaces(ctrlr); + + spdk_bit_array_free(&ctrlr->free_io_qids); + + nvme_transport_ctrlr_destruct(ctrlr); +} + +int +nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + return nvme_qpair_submit_request(ctrlr->adminq, req); +} + +static void +nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl) +{ + /* Do nothing */ +} + +/* + * Check if we need to send a Keep Alive command. + * Caller must hold ctrlr->ctrlr_lock. 
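+ *
+ * The keep-alive is driven from spdk_nvme_ctrlr_process_admin_completions(),
+ * so an application must poll the admin queue at least once per keep-alive
+ * interval, e.g. (illustrative only):
+ *
+ *   while (running) {
+ *       spdk_nvme_ctrlr_process_admin_completions(ctrlr);
+ *       spdk_nvme_qpair_process_completions(io_qpair, 0);
+ *   }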
+ */ +static void +nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr) +{ + uint64_t now; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + now = spdk_get_ticks(); + if (now < ctrlr->next_keep_alive_tick) { + return; + } + + req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL); + if (req == NULL) { + return; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + if (rc != 0) { + SPDK_ERRLOG("Submitting Keep Alive failed\n"); + } + + ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks; +} + +int32_t +spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr) +{ + int32_t num_completions; + int32_t rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (ctrlr->keep_alive_interval_ticks) { + nvme_ctrlr_keep_alive(ctrlr); + } + + rc = nvme_io_msg_process(ctrlr); + if (rc < 0) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; + } + num_completions = rc; + + rc = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (rc < 0) { + num_completions = rc; + } else { + num_completions += rc; + } + + return num_completions; +} + +const struct spdk_nvme_ctrlr_data * +spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr) +{ + return &ctrlr->cdata; +} + +union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_csts_register csts; + + if (nvme_ctrlr_get_csts(ctrlr, &csts)) { + csts.raw = 0xFFFFFFFFu; + } + return csts; +} + +union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->cap; +} + +union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->vs; +} + +union spdk_nvme_cmbsz_register spdk_nvme_ctrlr_get_regs_cmbsz(struct spdk_nvme_ctrlr *ctrlr) +{ + union spdk_nvme_cmbsz_register cmbsz; + + if (nvme_ctrlr_get_cmbsz(ctrlr, &cmbsz)) { + cmbsz.raw = 0; + } + + return cmbsz; +} + +uint32_t +spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->num_ns; +} + +static int32_t +nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + int32_t result = -1; + + if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) { + return result; + } + + int32_t lower = 0; + int32_t upper = ctrlr->num_ns - 1; + int32_t mid; + + while (lower <= upper) { + mid = lower + (upper - lower) / 2; + if (ctrlr->active_ns_list[mid] == nsid) { + result = mid; + break; + } else { + if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) { + lower = mid + 1; + } else { + upper = mid - 1; + } + + } + } + + return result; +} + +bool +spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + return nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1; +} + +uint32_t +spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->active_ns_list ? 
ctrlr->active_ns_list[0] : 0; +} + +uint32_t +spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid) +{ + int32_t nsid_idx = nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid); + if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) { + return ctrlr->active_ns_list[nsid_idx + 1]; + } + return 0; +} + +struct spdk_nvme_ns * +spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + if (nsid < 1 || nsid > ctrlr->num_ns) { + return NULL; + } + + return &ctrlr->ns[nsid - 1]; +} + +struct spdk_pci_device * +spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr == NULL) { + return NULL; + } + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return NULL; + } + + return nvme_ctrlr_proc_get_devhandle(ctrlr); +} + +uint32_t +spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->max_xfer_size; +} + +void +spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_aer_cb aer_cb_fn, + void *aer_cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->aer_cb_fn = aer_cb_fn; + active_proc->aer_cb_arg = aer_cb_arg; + } + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +void +spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr, + uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg) +{ + struct spdk_nvme_ctrlr_process *active_proc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + active_proc = nvme_ctrlr_get_current_process(ctrlr); + if (active_proc) { + active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL; + active_proc->timeout_cb_fn = cb_fn; + active_proc->timeout_cb_arg = cb_arg; + } + + ctrlr->timeout_enabled = true; + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +bool +spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page) +{ + /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch"); + return ctrlr->log_page_supported[log_page]; +} + +bool +spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code) +{ + /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */ + SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch"); + return ctrlr->feature_supported[feature_code]; +} + +int +spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + return nvme_ns_construct(ns, nsid, ctrlr); +} + +int 
+spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + /* Inactive NS */ + nvme_ns_destruct(ns); + + return 0; +} + +uint32_t +spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload) +{ + struct nvme_completion_poll_status *status; + int res; + uint32_t nsid; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return 0; + } + + res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, status); + if (res) { + free(status); + return 0; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return 0; + } + + nsid = status->cpl.cdw0; + ns = &ctrlr->ns[nsid - 1]; + free(status); + /* Inactive NS */ + res = nvme_ns_construct(ns, nsid, ctrlr); + if (res) { + return 0; + } + + /* Return the namespace ID that was created */ + return nsid; +} + +int +spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct nvme_completion_poll_status *status; + int res; + struct spdk_nvme_ns *ns; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + res = nvme_ctrlr_identify_active_ns(ctrlr); + if (res) { + return res; + } + + ns = &ctrlr->ns[nsid - 1]; + nvme_ns_destruct(ns); + + return 0; +} + +int +spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format) +{ + struct nvme_completion_poll_status *status; + int res; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +int +spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size, + int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status) +{ + struct spdk_nvme_fw_commit fw_commit; + 
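+	/*
+	 * The image is sent via Firmware Image Download in min_page_size chunks,
+	 * then activated with a single Firmware Commit, and the controller is
+	 * reset on success. Typical call from an application (fw_image, fw_size
+	 * and slot 0 are illustrative values only):
+	 *
+	 *   struct spdk_nvme_status fw_status;
+	 *   rc = spdk_nvme_ctrlr_update_firmware(ctrlr, fw_image, fw_size, 0,
+	 *            SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG, &fw_status);
+	 */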
struct nvme_completion_poll_status *status; + int res; + unsigned int size_remaining; + unsigned int offset; + unsigned int transfer; + void *p; + + if (!completion_status) { + return -EINVAL; + } + memset(completion_status, 0, sizeof(struct spdk_nvme_status)); + if (size % 4) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n"); + return -1; + } + + /* Current support only for SPDK_NVME_FW_COMMIT_REPLACE_IMG + * and SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG + */ + if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) && + (commit_action != SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n"); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Firmware download */ + size_remaining = size; + offset = 0; + p = payload; + + while (size_remaining > 0) { + transfer = spdk_min(size_remaining, ctrlr->min_page_size); + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p, + nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + p += transfer; + offset += transfer; + size_remaining -= transfer; + } + + /* Firmware commit */ + memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit)); + fw_commit.fs = slot; + fw_commit.ca = commit_action; + + memset(status, 0, sizeof(*status)); + res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + + res = nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock); + + memcpy(completion_status, &status->cpl.status, sizeof(struct spdk_nvme_status)); + + if (!status->timed_out) { + free(status); + } + + if (res) { + if (completion_status->sct != SPDK_NVME_SCT_COMMAND_SPECIFIC || + completion_status->sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) { + if (completion_status->sct == SPDK_NVME_SCT_COMMAND_SPECIFIC && + completion_status->sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) { + SPDK_NOTICELOG("firmware activation requires conventional reset to be performed. 
!\n"); + } else { + SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n"); + } + return -ENXIO; + } + } + + return spdk_nvme_ctrlr_reset(ctrlr); +} + +int +spdk_nvme_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc, size; + union spdk_nvme_cmbsz_register cmbsz; + + cmbsz = spdk_nvme_ctrlr_get_regs_cmbsz(ctrlr); + + if (cmbsz.bits.rds == 0 || cmbsz.bits.wds == 0) { + return -ENOTSUP; + } + + size = cmbsz.bits.sz * (0x1000 << (cmbsz.bits.szu * 4)); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + rc = nvme_transport_ctrlr_reserve_cmb(ctrlr); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + if (rc < 0) { + return rc; + } + + return size; +} + +void * +spdk_nvme_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + void *buf; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + buf = nvme_transport_ctrlr_map_cmb(ctrlr, size); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return buf; +} + +void +spdk_nvme_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + nvme_transport_ctrlr_unmap_cmb(ctrlr); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); +} + +bool +spdk_nvme_ctrlr_is_discovery(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr); + + return !strncmp(ctrlr->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN, + strlen(SPDK_NVMF_DISCOVERY_NQN)); +} + +int +spdk_nvme_ctrlr_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, size_t size) +{ + struct nvme_completion_poll_status *status; + int res; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = spdk_nvme_ctrlr_cmd_security_receive(ctrlr, secp, spsp, nssf, payload, size, + nvme_completion_poll_cb, status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_receive failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + free(status); + + return 0; +} + +int +spdk_nvme_ctrlr_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, size_t size) +{ + struct nvme_completion_poll_status *status; + int res; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + res = spdk_nvme_ctrlr_cmd_security_send(ctrlr, secp, spsp, nssf, payload, size, + nvme_completion_poll_cb, + status); + if (res) { + free(status); + return res; + } + if (nvme_wait_for_completion_robust_lock(ctrlr->adminq, status, &ctrlr->ctrlr_lock)) { + SPDK_ERRLOG("spdk_nvme_ctrlr_cmd_security_send failed!\n"); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + + free(status); + + return 0; +} + +uint64_t +spdk_nvme_ctrlr_get_flags(struct spdk_nvme_ctrlr *ctrlr) +{ + return ctrlr->flags; +} + +const struct spdk_nvme_transport_id * +spdk_nvme_ctrlr_get_transport_id(struct spdk_nvme_ctrlr *ctrlr) +{ + return &ctrlr->trid; +} + +/* FIXME need to specify max number of iovs */ +int +spdk_nvme_map_prps(void *prv, struct spdk_nvme_cmd *cmd, struct iovec *iovs, + uint32_t len, size_t mps, + void *(*gpa_to_vva)(void *prv, uint64_t addr, uint64_t len)) +{ + uint64_t prp1, prp2; + void *vva; + uint32_t i; + uint32_t residue_len, nents; + uint64_t *prp_list; + int iovcnt; + + prp1 = cmd->dptr.prp.prp1; + prp2 = cmd->dptr.prp.prp2; + + /* PRP1 may started with unaligned page address */ + residue_len = 
mps - (prp1 % mps); + residue_len = spdk_min(len, residue_len); + + vva = gpa_to_vva(prv, prp1, residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("GPA to VVA failed\n"); + return -1; + } + iovs[0].iov_base = vva; + iovs[0].iov_len = residue_len; + len -= residue_len; + + if (len) { + if (spdk_unlikely(prp2 == 0)) { + SPDK_ERRLOG("no PRP2, %d remaining\n", len); + return -1; + } + + if (len <= mps) { + /* 2 PRP used */ + iovcnt = 2; + vva = gpa_to_vva(prv, prp2, len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, len%#x\n", + prp2, len); + return -1; + } + iovs[1].iov_base = vva; + iovs[1].iov_len = len; + } else { + /* PRP list used */ + nents = (len + mps - 1) / mps; + vva = gpa_to_vva(prv, prp2, nents * sizeof(*prp_list)); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, nents=%#x\n", + prp2, nents); + return -1; + } + prp_list = vva; + i = 0; + while (len != 0) { + residue_len = spdk_min(len, mps); + vva = gpa_to_vva(prv, prp_list[i], residue_len); + if (spdk_unlikely(vva == NULL)) { + SPDK_ERRLOG("no VVA for %#lx, residue_len=%#x\n", + prp_list[i], residue_len); + return -1; + } + iovs[i + 1].iov_base = vva; + iovs[i + 1].iov_len = residue_len; + len -= residue_len; + i++; + } + iovcnt = i + 1; + } + } else { + /* 1 PRP used */ + iovcnt = 1; + } + + return iovcnt; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c new file mode 100644 index 000000000..9b16c8d6f --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c @@ -0,0 +1,966 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "nvme_internal.h" + +int +spdk_nvme_ctrlr_io_cmd_raw_no_payload_build(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + return -EINVAL; + } + + memset(&payload, 0, sizeof(payload)); + req = nvme_allocate_request(qpair, &payload, 0, 0, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg); + + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, void *md_buf, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct nvme_payload payload; + uint32_t md_len = 0; + + payload = NVME_PAYLOAD_CONTIG(buf, md_buf); + + /* Caculate metadata length */ + if (md_buf) { + struct spdk_nvme_ns *ns = &ctrlr->ns[cmd->nsid - 1]; + + assert(ns->sector_size != 0); + md_len = len / ns->sector_size * ns->md_size; + } + + req = nvme_allocate_request(qpair, &payload, len, md_len, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + void *buf, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + memcpy(&req->cmd, cmd, sizeof(req->cmd)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_IDENTIFY; + cmd->cdw10_bits.identify.cns = cns; + cmd->cdw10_bits.identify.cntid = cntid; + cmd->nsid = nsid; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +int +nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_ATTACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ctrlr_list), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT; + cmd->nsid = nsid; + cmd->cdw10_bits.ns_attach.sel = SPDK_NVME_NS_CTRLR_DETACH; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, sizeof(struct spdk_nvme_ns_data), + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_CREATE; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT; + cmd->cdw10_bits.ns_manage.sel = SPDK_NVME_NS_MANAGEMENT_DELETE; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG; + cmd->dptr.prp.prp1 = prp1; + cmd->dptr.prp.prp2 = prp2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FORMAT_NVM; + cmd->nsid = nsid; + 
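+	/*
+	 * struct spdk_nvme_format packs LBAF/MS/PI/PIL/SES into a single 32-bit
+	 * value, so it is copied directly into CDW10 below.
+	 */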
memcpy(&cmd->cdw10, format, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_FEATURES; + cmd->cdw10_bits.get_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature, + uint32_t cdw11, uint32_t cdw12, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t ns_id) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg, + true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SET_FEATURES; + cmd->cdw10_bits.set_features.fid = feature; + cmd->cdw11 = cdw11; + cmd->cdw12 = cdw12; + cmd->nsid = ns_id; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_number_of_queues feat_num_queues; + + feat_num_queues.raw = 0; + feat_num_queues.bits.nsqr 
= num_queues - 1; + feat_num_queues.bits.ncqr = num_queues - 1; + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, feat_num_queues.raw, + 0, + NULL, 0, cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + uint32_t cdw11; + + cdw11 = config.raw; + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0, + NULL, 0, + cb_fn, cb_arg); +} + +int +nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + union spdk_nvme_feat_host_identifier feat_host_identifier; + + feat_host_identifier.raw = 0; + if (host_id_size == 16) { + /* 128-bit extended host identifier */ + feat_host_identifier.bits.exhid = 1; + } else if (host_id_size == 8) { + /* 64-bit host identifier */ + feat_host_identifier.bits.exhid = 0; + } else { + SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size); + return -EINVAL; + } + + return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER, + feat_host_identifier.raw, 0, + host_id, host_id_size, cb_fn, cb_arg); +} + +int +spdk_nvme_ctrlr_cmd_get_log_page_ext(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, uint32_t cdw10, + uint32_t cdw11, uint32_t cdw14, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint32_t numd, numdl, numdu; + uint32_t lpol, lpou; + int rc; + + if (payload_size == 0) { + return -EINVAL; + } + + if (offset & 3) { + return -EINVAL; + } + + numd = payload_size / sizeof(uint32_t) - 1u; + numdl = numd & 0xFFFFu; + numdu = (numd >> 16) & 0xFFFFu; + + lpol = (uint32_t)offset; + lpou = (uint32_t)(offset >> 32); + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + + if (offset && !ctrlr->cdata.lpa.edlp) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE; + cmd->nsid = nsid; + cmd->cdw10 = cdw10; + cmd->cdw10_bits.get_log_page.numdl = numdl; + cmd->cdw10_bits.get_log_page.lid = log_page; + + cmd->cdw11 = cdw11; + cmd->cdw11_bits.get_log_page.numdu = numdu; + cmd->cdw12 = lpol; + cmd->cdw13 = lpou; + cmd->cdw14 = cdw14; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page, + uint32_t nsid, void *payload, uint32_t payload_size, + uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return spdk_nvme_ctrlr_cmd_get_log_page_ext(ctrlr, log_page, nsid, payload, + payload_size, offset, 0, 0, 0, cb_fn, cb_arg); +} + +static void +nvme_ctrlr_retry_queued_abort(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_request *next, *tmp; + int rc; + + if (ctrlr->is_resetting || ctrlr->is_destructed) { + return; + } + + STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) { + 
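+		/*
+		 * Resubmit queued aborts one at a time: the loop stops after the
+		 * first successful submission, and the completion callback of that
+		 * abort re-invokes this function to drain the remaining queue.
+		 */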
STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq); + ctrlr->outstanding_aborts++; + rc = nvme_ctrlr_submit_admin_request(ctrlr, next); + if (rc < 0) { + SPDK_ERRLOG("Failed to submit queued abort.\n"); + memset(&next->cpl, 0, sizeof(next->cpl)); + next->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + next->cpl.status.dnr = 1; + nvme_complete_request(next->cb_fn, next->cb_arg, next->qpair, next, &next->cpl); + nvme_free_request(next); + } else { + /* If the first abort succeeds, stop iterating. */ + break; + } + } +} + +static int +_nvme_ctrlr_submit_abort_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req) +{ + /* ACL is a 0's based value. */ + if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl + 1U) { + STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq); + return 0; + } else { + ctrlr->outstanding_aborts++; + return nvme_ctrlr_submit_admin_request(ctrlr, req); + } +} + +static void +nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + req->user_cb_fn(req->user_cb_arg, cpl); +} + +int +spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_cmd_abort_cpl, NULL); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + req->cb_arg = req; + req->user_cb_fn = cb_fn; + req->user_cb_arg = cb_arg; + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_ABORT; + cmd->cdw10_bits.abort.sqid = qpair->id; + cmd->cdw10_bits.abort.cid = cid; + + rc = _nvme_ctrlr_submit_abort_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +static void +nvme_complete_abort_request(void *ctx, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *req = ctx; + struct nvme_request *parent = req->parent; + struct spdk_nvme_ctrlr *ctrlr; + + ctrlr = req->qpair->ctrlr; + + ctrlr->outstanding_aborts--; + nvme_ctrlr_retry_queued_abort(ctrlr); + + nvme_request_remove_child(parent, req); + + if (!spdk_nvme_cpl_is_abort_success(cpl)) { + parent->parent_status.cdw0 |= 1U; + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static int +nvme_request_add_abort(struct nvme_request *req, void *arg) +{ + struct nvme_request *parent = arg; + struct nvme_request *child; + void *cmd_cb_arg; + + cmd_cb_arg = parent->user_cb_arg; + + if (req->cb_arg != cmd_cb_arg && + (req->parent == NULL || req->parent->cb_arg != cmd_cb_arg)) { + return 0; + } + + child = nvme_allocate_request_null(parent->qpair->ctrlr->adminq, + nvme_complete_abort_request, NULL); + if (child == NULL) { + return -ENOMEM; + } + + child->cb_arg = child; + + child->cmd.opc = SPDK_NVME_OPC_ABORT; + /* Copy SQID from the parent. 
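+	 * The CID is taken from the specific outstanding request found by the
+	 * iterator, so each child abort targets exactly one command.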
*/ + child->cmd.cdw10_bits.abort.sqid = parent->cmd.cdw10_bits.abort.sqid; + child->cmd.cdw10_bits.abort.cid = req->cmd.cid; + + child->parent = parent; + + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + parent->num_children++; + + return 0; +} + +int +spdk_nvme_ctrlr_cmd_abort_ext(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + void *cmd_cb_arg, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + int rc = 0; + struct nvme_request *parent, *child, *tmp; + bool child_failed = false; + int aborted = 0; + + if (cmd_cb_arg == NULL) { + return -EINVAL; + } + + pthread_mutex_lock(&ctrlr->ctrlr_lock); + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + parent = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (parent == NULL) { + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + + return -ENOMEM; + } + + TAILQ_INIT(&parent->children); + parent->num_children = 0; + + parent->cmd.opc = SPDK_NVME_OPC_ABORT; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + + /* Hold SQID that the requests to abort are associated with. + * This will be copied to the children. + * + * CID is not set here because the parent is not submitted directly + * and CID is not determined until request to abort is found. + */ + parent->cmd.cdw10_bits.abort.sqid = qpair->id; + + /* This is used to find request to abort. */ + parent->user_cb_arg = cmd_cb_arg; + + /* Add an abort request for each outstanding request which has cmd_cb_arg + * as its callback context. + */ + rc = nvme_transport_qpair_iterate_requests(qpair, nvme_request_add_abort, parent); + if (rc != 0) { + /* Free abort requests already added. */ + child_failed = true; + } + + TAILQ_FOREACH_SAFE(child, &parent->children, child_tailq, tmp) { + if (spdk_likely(!child_failed)) { + rc = _nvme_ctrlr_submit_abort_request(ctrlr, child); + if (spdk_unlikely(rc != 0)) { + child_failed = true; + } + } else { + /* Free remaining abort requests. */ + nvme_request_remove_child(parent, child); + nvme_free_request(child); + } + } + + if (spdk_likely(!child_failed)) { + /* There is no error so far. Abort requests were submitted successfully + * or there was no outstanding request to abort. + * + * Hence abort queued requests which has cmd_cb_arg as its callback + * context next. + */ + aborted = nvme_qpair_abort_queued_reqs(qpair, cmd_cb_arg); + if (parent->num_children == 0) { + /* There was no outstanding request to abort. */ + if (aborted > 0) { + /* The queued requests were successfully aborted. Hence + * complete the parent request with success synchronously. + */ + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } else { + /* There was no queued request to abort. */ + rc = -ENOENT; + } + } + } else { + /* Failed to add or submit abort request. */ + if (parent->num_children != 0) { + /* Return success since we must wait for those children + * to complete but set the parent request to failure. 
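+			 * Setting bit 0 of CDW0 marks the abort as unsuccessful,
+			 * matching the convention checked by
+			 * spdk_nvme_cpl_is_abort_success() above.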
+ */ + parent->parent_status.cdw0 |= 1U; + rc = 0; + } + } + + if (rc != 0) { + nvme_free_request(parent); + } + + pthread_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} + +int +nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT; + memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; + +} + +int +nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD; + cmd->cdw10 = (size >> 2) - 1; + cmd->cdw11 = offset >> 2; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp, + uint16_t spsp, uint8_t nssf, void *payload, + uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, + cb_fn, cb_arg, true); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SECURITY_SEND; + cmd->cdw10_bits.sec_send_recv.nssf = nssf; + cmd->cdw10_bits.sec_send_recv.spsp0 = (uint8_t)spsp; + cmd->cdw10_bits.sec_send_recv.spsp1 = (uint8_t)(spsp >> 8); + cmd->cdw10_bits.sec_send_recv.secp = secp; + cmd->cdw11 = payload_size; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} + +int +nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + 
spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_SANITIZE; + cmd->nsid = nsid; + cmd->cdw11 = cdw11; + memcpy(&cmd->cdw10, sanitize, sizeof(cmd->cdw10)); + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c new file mode 100644 index 000000000..2eba219ce --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c @@ -0,0 +1,88 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +bool +spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr) +{ + if (ctrlr->quirks & NVME_QUIRK_OCSSD) { + /* TODO: There isn't a standardized way to identify Open-Channel SSD + * different verdors may have different conditions. + */ + + /* + * Current QEMU OpenChannel Device needs to check nsdata->vs[0]. + * Here check nsdata->vs[0] of the first namespace. 
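+		 *
+		 * Typical application-side check (illustrative only; geometry_done_cb
+		 * is an application-defined completion callback):
+		 *
+		 *   struct spdk_ocssd_geometry_data geo;
+		 *
+		 *   if (spdk_nvme_ctrlr_is_ocssd_supported(ctrlr)) {
+		 *       spdk_nvme_ocssd_ctrlr_cmd_geometry(ctrlr, nsid, &geo,
+		 *               sizeof(geo), geometry_done_cb, NULL);
+		 *   }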
+ */ + if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) { + if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) { + return true; + } + } + } + return false; +} + + +int +spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *payload, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + int rc; + + if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) { + return -EINVAL; + } + + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + req = nvme_allocate_request_user_copy(ctrlr->adminq, + payload, payload_size, cb_fn, cb_arg, false); + if (req == NULL) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_GEOMETRY; + cmd->nsid = nsid; + + rc = nvme_ctrlr_submit_admin_request(ctrlr, req); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + return rc; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.c b/src/spdk/lib/nvme/nvme_cuse.c new file mode 100644 index 000000000..9a5ee1f0d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.c @@ -0,0 +1,1115 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#define FUSE_USE_VERSION 31 + +#include <fuse3/cuse_lowlevel.h> + +#include <linux/nvme_ioctl.h> +#include <linux/fs.h> + +#include "nvme_internal.h" +#include "nvme_io_msg.h" +#include "nvme_cuse.h" + +struct cuse_device { + bool is_started; + + char dev_name[128]; + uint32_t index; + int claim_fd; + char lock_name[64]; + + struct spdk_nvme_ctrlr *ctrlr; /**< NVMe controller */ + uint32_t nsid; /**< NVMe name space id, or 0 */ + + pthread_t tid; + struct fuse_session *session; + + struct cuse_device *ctrlr_device; + struct cuse_device *ns_devices; /**< Array of cuse ns devices */ + + TAILQ_ENTRY(cuse_device) tailq; +}; + +static pthread_mutex_t g_cuse_mtx = PTHREAD_MUTEX_INITIALIZER; +static TAILQ_HEAD(, cuse_device) g_ctrlr_ctx_head = TAILQ_HEAD_INITIALIZER(g_ctrlr_ctx_head); +static struct spdk_bit_array *g_ctrlr_started; + +struct cuse_io_ctx { + struct spdk_nvme_cmd nvme_cmd; + enum spdk_nvme_data_transfer data_transfer; + + uint64_t lba; + uint32_t lba_count; + + void *data; + int data_len; + + fuse_req_t req; +}; + +static void +cuse_io_ctx_free(struct cuse_io_ctx *ctx) +{ + spdk_free(ctx->data); + free(ctx); +} + +#define FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, val) \ + if (out_bufsz == 0) { \ + struct iovec out_iov; \ + out_iov.iov_base = (void *)arg; \ + out_iov.iov_len = sizeof(val); \ + fuse_reply_ioctl_retry(req, NULL, 0, &out_iov, 1); \ + return; \ + } + +static void +cuse_nvme_admin_cmd_cb(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = arg; + struct iovec out_iov[2]; + struct spdk_nvme_cpl _cpl; + + if (ctx->data_transfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + } else { + memcpy(&_cpl, cpl, sizeof(struct spdk_nvme_cpl)); + + out_iov[0].iov_base = &_cpl.cdw0; + out_iov[0].iov_len = sizeof(_cpl.cdw0); + + if (ctx->data_len > 0) { + out_iov[1].iov_base = ctx->data; + out_iov[1].iov_len = ctx->data_len; + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 2); + } else { + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, out_iov, 1); + } + } + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_admin_cmd_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, &ctx->nvme_cmd, ctx->data, ctx->data_len, + cuse_nvme_admin_cmd_cb, (void *)ctx); + if (rc < 0) { + fuse_reply_err(ctx->req, EINVAL); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_admin_cmd_send(fuse_req_t req, struct nvme_admin_cmd *admin_cmd, + const void *data) +{ + struct cuse_io_ctx *ctx; + struct cuse_device *cuse_device = fuse_req_userdata(req); + int rv; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for cuse_io_ctx\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->data_transfer = spdk_nvme_opc_get_data_transfer(admin_cmd->opcode); + + memset(&ctx->nvme_cmd, 0, sizeof(ctx->nvme_cmd)); + ctx->nvme_cmd.opc = admin_cmd->opcode; + ctx->nvme_cmd.nsid = admin_cmd->nsid; + ctx->nvme_cmd.cdw10 = admin_cmd->cdw10; + ctx->nvme_cmd.cdw11 = admin_cmd->cdw11; + ctx->nvme_cmd.cdw12 = admin_cmd->cdw12; + ctx->nvme_cmd.cdw13 = admin_cmd->cdw13; + ctx->nvme_cmd.cdw14 = admin_cmd->cdw14; + ctx->nvme_cmd.cdw15 = admin_cmd->cdw15; + + ctx->data_len = admin_cmd->data_len; + + if (ctx->data_len > 0) { + ctx->data = spdk_malloc(ctx->data_len, 0, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!ctx->data) { + 
SPDK_ERRLOG("Cannot allocate memory for data\n"); + fuse_reply_err(req, ENOMEM); + free(ctx); + return; + } + if (data != NULL) { + memcpy(ctx->data, data, ctx->data_len); + } + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, 0, cuse_nvme_admin_cmd_execute, ctx); + if (rv) { + SPDK_ERRLOG("Cannot send io msg to the controller\n"); + fuse_reply_err(req, -rv); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_admin_cmd(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct nvme_admin_cmd *admin_cmd; + struct iovec in_iov[2], out_iov[2]; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*admin_cmd); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + admin_cmd = (struct nvme_admin_cmd *)in_buf; + + switch (spdk_nvme_opc_get_data_transfer(admin_cmd->opcode)) { + case SPDK_NVME_DATA_NONE: + SPDK_ERRLOG("SPDK_NVME_DATA_NONE not implemented\n"); + fuse_reply_err(req, EINVAL); + return; + case SPDK_NVME_DATA_HOST_TO_CONTROLLER: + if (admin_cmd->addr != 0) { + in_iov[1].iov_base = (void *)admin_cmd->addr; + in_iov[1].iov_len = admin_cmd->data_len; + if (in_bufsz == sizeof(*admin_cmd)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + cuse_nvme_admin_cmd_send(req, admin_cmd, in_buf + sizeof(*admin_cmd)); + } else { + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + } + return; + case SPDK_NVME_DATA_CONTROLLER_TO_HOST: + if (out_bufsz == 0) { + out_iov[0].iov_base = &((struct nvme_admin_cmd *)arg)->result; + out_iov[0].iov_len = sizeof(uint32_t); + if (admin_cmd->data_len > 0) { + out_iov[1].iov_base = (void *)admin_cmd->addr; + out_iov[1].iov_len = admin_cmd->data_len; + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 2); + } else { + fuse_reply_ioctl_retry(req, in_iov, 1, out_iov, 1); + } + return; + } + + cuse_nvme_admin_cmd_send(req, admin_cmd, NULL); + + return; + case SPDK_NVME_DATA_BIDIRECTIONAL: + fuse_reply_err(req, EINVAL); + return; + } +} + +static void +cuse_nvme_reset_execute(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + fuse_req_t req = arg; + + rc = spdk_nvme_ctrlr_reset(ctrlr); + if (rc) { + fuse_reply_err(req, rc); + return; + } + + fuse_reply_ioctl_iov(req, 0, NULL, 0); +} + +static void +cuse_nvme_reset(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rv; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + if (cuse_device->nsid) { + SPDK_ERRLOG("Namespace reset not supported\n"); + fuse_reply_err(req, EINVAL); + return; + } + + rv = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_reset_execute, (void *)req); + if (rv) { + SPDK_ERRLOG("Cannot send reset\n"); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * Namespace IO requests + */ + +static void +cuse_nvme_submit_io_write_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, NULL, 0); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_write_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_write(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* 
LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_write_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("write failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_write(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io = in_buf; + struct cuse_io_ctx *ctx; + struct spdk_nvme_ns *ns; + uint32_t block_size; + int rc; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks + 1; + ctx->data_len = ctx->lba_count * block_size; + + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Write buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + memcpy(ctx->data, in_buf + sizeof(*user_io), ctx->data_len); + + rc = nvme_io_msg_send(cuse_device->ctrlr, cuse_device->nsid, cuse_nvme_submit_io_write_cb, + ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send write io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + +static void +cuse_nvme_submit_io_read_done(void *ref, const struct spdk_nvme_cpl *cpl) +{ + struct cuse_io_ctx *ctx = (struct cuse_io_ctx *)ref; + struct iovec iov; + + iov.iov_base = ctx->data; + iov.iov_len = ctx->data_len; + + fuse_reply_ioctl_iov(ctx->req, cpl->status.sc, &iov, 1); + + cuse_io_ctx_free(ctx); +} + +static void +cuse_nvme_submit_io_read_cb(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg) +{ + int rc; + struct cuse_io_ctx *ctx = arg; + struct spdk_nvme_ns *ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid); + + rc = spdk_nvme_ns_cmd_read(ns, ctrlr->external_io_msgs_qpair, ctx->data, + ctx->lba, /* LBA start */ + ctx->lba_count, /* number of LBAs */ + cuse_nvme_submit_io_read_done, ctx, 0); + + if (rc != 0) { + SPDK_ERRLOG("read failed: rc = %d\n", rc); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + return; + } +} + +static void +cuse_nvme_submit_io_read(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int rc; + struct cuse_io_ctx *ctx; + const struct nvme_user_io *user_io = in_buf; + struct cuse_device *cuse_device = fuse_req_userdata(req); + struct spdk_nvme_ns *ns; + uint32_t block_size; + + ctx = (struct cuse_io_ctx *)calloc(1, sizeof(struct cuse_io_ctx)); + if (!ctx) { + SPDK_ERRLOG("Cannot allocate memory for context\n"); + fuse_reply_err(req, ENOMEM); + return; + } + + ctx->req = req; + ctx->lba = user_io->slba; + ctx->lba_count = user_io->nblocks; + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + block_size = spdk_nvme_ns_get_sector_size(ns); + + ctx->data_len = ctx->lba_count * block_size; + ctx->data = spdk_zmalloc(ctx->data_len, 0x1000, NULL, SPDK_ENV_SOCKET_ID_ANY, + SPDK_MALLOC_DMA); + if (ctx->data == NULL) { + SPDK_ERRLOG("Read buffer allocation failed\n"); + fuse_reply_err(ctx->req, ENOMEM); + free(ctx); + return; + } + + rc = nvme_io_msg_send(cuse_device->ctrlr, 
cuse_device->nsid, cuse_nvme_submit_io_read_cb, ctx); + if (rc < 0) { + SPDK_ERRLOG("Cannot send read io\n"); + fuse_reply_err(ctx->req, rc); + cuse_io_ctx_free(ctx); + } +} + + +static void +cuse_nvme_submit_io(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + const struct nvme_user_io *user_io; + struct iovec in_iov[2], out_iov; + + in_iov[0].iov_base = (void *)arg; + in_iov[0].iov_len = sizeof(*user_io); + if (in_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, NULL, 0); + return; + } + + user_io = in_buf; + + switch (user_io->opcode) { + case SPDK_NVME_OPC_READ: + out_iov.iov_base = (void *)user_io->addr; + out_iov.iov_len = (user_io->nblocks + 1) * 512; + if (out_bufsz == 0) { + fuse_reply_ioctl_retry(req, in_iov, 1, &out_iov, 1); + return; + } + + cuse_nvme_submit_io_read(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + break; + case SPDK_NVME_OPC_WRITE: + in_iov[1].iov_base = (void *)user_io->addr; + in_iov[1].iov_len = (user_io->nblocks + 1) * 512; + if (in_bufsz == sizeof(*user_io)) { + fuse_reply_ioctl_retry(req, in_iov, 2, NULL, 0); + return; + } + + cuse_nvme_submit_io_write(req, cmd, arg, fi, flags, in_buf, + in_bufsz, out_bufsz); + + break; + default: + SPDK_ERRLOG("SUBMIT_IO: opc:%d not valid\n", user_io->opcode); + fuse_reply_err(req, EINVAL); + return; + } + +} + +/***************************************************************************** + * Other namespace IOCTLs + */ +static void +cuse_blkgetsize64(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + uint64_t size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + size = spdk_nvme_ns_get_num_sectors(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_blkpbszget(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + int pbsz; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, pbsz); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + pbsz = spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &pbsz, sizeof(pbsz)); +} + +static void +cuse_blkgetsize(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + long size; + struct spdk_nvme_ns *ns; + struct cuse_device *cuse_device = fuse_req_userdata(req); + + FUSE_REPLY_CHECK_BUFFER(req, arg, out_bufsz, size); + + ns = spdk_nvme_ctrlr_get_ns(cuse_device->ctrlr, cuse_device->nsid); + + /* return size in 512 bytes blocks */ + size = spdk_nvme_ns_get_num_sectors(ns) * 512 / spdk_nvme_ns_get_sector_size(ns); + fuse_reply_ioctl(req, 0, &size, sizeof(size)); +} + +static void +cuse_getid(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + struct cuse_device *cuse_device = fuse_req_userdata(req); + + fuse_reply_ioctl(req, cuse_device->nsid, NULL, 0); +} + +static void +cuse_ctrlr_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) 
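+/* ioctl dispatcher for the per-controller character device: only
+ * NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_RESET are accepted here, while
+ * namespace-level ioctls are handled by cuse_ns_ioctl() below. */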
+{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_RESET: + cuse_nvme_reset(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +static void +cuse_ns_ioctl(fuse_req_t req, int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, + const void *in_buf, size_t in_bufsz, size_t out_bufsz) +{ + if (flags & FUSE_IOCTL_COMPAT) { + fuse_reply_err(req, ENOSYS); + return; + } + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + cuse_nvme_admin_cmd(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_SUBMIT_IO: + cuse_nvme_submit_io(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case NVME_IOCTL_ID: + cuse_getid(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKPBSZGET: + cuse_blkpbszget(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE: + /* Returns the device size as a number of 512-byte blocks (returns pointer to long) */ + cuse_blkgetsize(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + case BLKGETSIZE64: + /* Returns the device size in sectors (returns pointer to uint64_t) */ + cuse_blkgetsize64(req, cmd, arg, fi, flags, in_buf, in_bufsz, out_bufsz); + break; + + default: + SPDK_ERRLOG("Unsupported IOCTL 0x%X.\n", cmd); + fuse_reply_err(req, EINVAL); + } +} + +/***************************************************************************** + * CUSE threads initialization. + */ + +static void cuse_open(fuse_req_t req, struct fuse_file_info *fi) +{ + fuse_reply_open(req, fi); +} + +static const struct cuse_lowlevel_ops cuse_ctrlr_clop = { + .open = cuse_open, + .ioctl = cuse_ctrlr_ioctl, +}; + +static const struct cuse_lowlevel_ops cuse_ns_clop = { + .open = cuse_open, + .ioctl = cuse_ns_ioctl, +}; + +static void * +cuse_thread(void *arg) +{ + struct cuse_device *cuse_device = arg; + char *cuse_argv[] = { "cuse", "-f" }; + int cuse_argc = SPDK_COUNTOF(cuse_argv); + char devname_arg[128 + 8]; + const char *dev_info_argv[] = { devname_arg }; + struct cuse_info ci; + int multithreaded; + int rc; + struct fuse_buf buf = { .mem = NULL }; + struct pollfd fds; + int timeout_msecs = 500; + + spdk_unaffinitize_thread(); + + snprintf(devname_arg, sizeof(devname_arg), "DEVNAME=%s", cuse_device->dev_name); + + memset(&ci, 0, sizeof(ci)); + ci.dev_info_argc = 1; + ci.dev_info_argv = dev_info_argv; + ci.flags = CUSE_UNRESTRICTED_IOCTL; + + if (cuse_device->nsid) { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ns_clop, + &multithreaded, cuse_device); + } else { + cuse_device->session = cuse_lowlevel_setup(cuse_argc, cuse_argv, &ci, &cuse_ctrlr_clop, + &multithreaded, cuse_device); + } + if (!cuse_device->session) { + SPDK_ERRLOG("Cannot create cuse session\n"); + goto err; + } + + SPDK_NOTICELOG("fuse session for device %s created\n", cuse_device->dev_name); + + /* Receive and process fuse requests */ + fds.fd = fuse_session_fd(cuse_device->session); + fds.events = POLLIN; + while (!fuse_session_exited(cuse_device->session)) { + rc = poll(&fds, 1, timeout_msecs); + if (rc <= 0) { + continue; + } + rc = fuse_session_receive_buf(cuse_device->session, &buf); + if (rc > 0) { + fuse_session_process_buf(cuse_device->session, &buf); + } + } + 
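+ /* The session has exited: free the receive buffer used by
+ * fuse_session_receive_buf() and tear down the CUSE session. */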
free(buf.mem); + fuse_session_reset(cuse_device->session); + cuse_lowlevel_teardown(cuse_device->session); +err: + pthread_exit(NULL); +} + +/***************************************************************************** + * CUSE devices management + */ + +static int +cuse_nvme_ns_start(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + int rv; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (ns_device->is_started) { + return 0; + } + + ns_device->ctrlr = ctrlr_device->ctrlr; + ns_device->ctrlr_device = ctrlr_device; + ns_device->nsid = nsid; + rv = snprintf(ns_device->dev_name, sizeof(ns_device->dev_name), "%sn%d", + ctrlr_device->dev_name, ns_device->nsid); + if (rv < 0) { + SPDK_ERRLOG("Device name too long.\n"); + free(ns_device); + return -ENAMETOOLONG; + } + + rv = pthread_create(&ns_device->tid, NULL, cuse_thread, ns_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + return -rv; + } + + ns_device->is_started = true; + + return 0; +} + +static void +cuse_nvme_ns_stop(struct cuse_device *ctrlr_device, uint32_t nsid) +{ + struct cuse_device *ns_device; + + ns_device = &ctrlr_device->ns_devices[nsid - 1]; + if (!ns_device->is_started) { + return; + } + + fuse_session_exit(ns_device->session); + pthread_join(ns_device->tid, NULL); + ns_device->is_started = false; +} + +static int +nvme_cuse_claim(struct cuse_device *ctrlr_device, uint32_t index) +{ + int dev_fd; + int pid; + void *dev_map; + struct flock cusedev_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + }; + + snprintf(ctrlr_device->lock_name, sizeof(ctrlr_device->lock_name), + "/tmp/spdk_nvme_cuse_lock_%" PRIu32, index); + + dev_fd = open(ctrlr_device->lock_name, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (dev_fd == -1) { + SPDK_ERRLOG("could not open %s\n", ctrlr_device->lock_name); + return -errno; + } + + if (ftruncate(dev_fd, sizeof(int)) != 0) { + SPDK_ERRLOG("could not truncate %s\n", ctrlr_device->lock_name); + close(dev_fd); + return -errno; + } + + dev_map = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, + MAP_SHARED, dev_fd, 0); + if (dev_map == MAP_FAILED) { + SPDK_ERRLOG("could not mmap dev %s (%d)\n", ctrlr_device->lock_name, errno); + close(dev_fd); + return -errno; + } + + if (fcntl(dev_fd, F_SETLK, &cusedev_lock) != 0) { + pid = *(int *)dev_map; + SPDK_ERRLOG("Cannot create lock on device %s, probably" + " process %d has claimed it\n", ctrlr_device->lock_name, pid); + munmap(dev_map, sizeof(int)); + close(dev_fd); + /* F_SETLK returns unspecified errnos, normalize them */ + return -EACCES; + } + + *(int *)dev_map = (int)getpid(); + munmap(dev_map, sizeof(int)); + ctrlr_device->claim_fd = dev_fd; + ctrlr_device->index = index; + /* Keep dev_fd open to maintain the lock. 
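+ * The advisory record lock taken with F_SETLK is released automatically
+ * when the descriptor is closed in nvme_cuse_unclaim().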
*/ + return 0; +} + +static void +nvme_cuse_unclaim(struct cuse_device *ctrlr_device) +{ + close(ctrlr_device->claim_fd); + ctrlr_device->claim_fd = -1; + unlink(ctrlr_device->lock_name); +} + +static void +cuse_nvme_ctrlr_stop(struct cuse_device *ctrlr_device) +{ + uint32_t i; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (i = 1; i <= num_ns; i++) { + cuse_nvme_ns_stop(ctrlr_device, i); + } + + fuse_session_exit(ctrlr_device->session); + pthread_join(ctrlr_device->tid, NULL); + TAILQ_REMOVE(&g_ctrlr_ctx_head, ctrlr_device, tailq); + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + nvme_cuse_unclaim(ctrlr_device); + free(ctrlr_device->ns_devices); + free(ctrlr_device); +} + +static int +cuse_nvme_ctrlr_update_namespaces(struct cuse_device *ctrlr_device) +{ + uint32_t nsid; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr_device->ctrlr); + + for (nsid = 1; nsid <= num_ns; nsid++) { + if (!spdk_nvme_ctrlr_is_active_ns(ctrlr_device->ctrlr, nsid)) { + cuse_nvme_ns_stop(ctrlr_device, nsid); + continue; + } + + if (cuse_nvme_ns_start(ctrlr_device, nsid) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace device."); + return -1; + } + } + + return 0; +} + +static int +nvme_cuse_start(struct spdk_nvme_ctrlr *ctrlr) +{ + int rv = 0; + struct cuse_device *ctrlr_device; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + SPDK_NOTICELOG("Creating cuse device for controller\n"); + + if (g_ctrlr_started == NULL) { + g_ctrlr_started = spdk_bit_array_create(128); + if (g_ctrlr_started == NULL) { + SPDK_ERRLOG("Cannot create bit array\n"); + return -ENOMEM; + } + } + + ctrlr_device = (struct cuse_device *)calloc(1, sizeof(struct cuse_device)); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot allocate memory for ctrlr_device."); + rv = -ENOMEM; + goto err2; + } + + ctrlr_device->ctrlr = ctrlr; + + /* Check if device already exists, if not increment index until success */ + ctrlr_device->index = 0; + while (1) { + ctrlr_device->index = spdk_bit_array_find_first_clear(g_ctrlr_started, ctrlr_device->index); + if (ctrlr_device->index == UINT32_MAX) { + SPDK_ERRLOG("Too many registered controllers\n"); + goto err2; + } + + if (nvme_cuse_claim(ctrlr_device, ctrlr_device->index) == 0) { + break; + } + ctrlr_device->index++; + } + spdk_bit_array_set(g_ctrlr_started, ctrlr_device->index); + snprintf(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name), "spdk/nvme%d", + ctrlr_device->index); + + rv = pthread_create(&ctrlr_device->tid, NULL, cuse_thread, ctrlr_device); + if (rv != 0) { + SPDK_ERRLOG("pthread_create failed\n"); + rv = -rv; + goto err3; + } + TAILQ_INSERT_TAIL(&g_ctrlr_ctx_head, ctrlr_device, tailq); + + ctrlr_device->ns_devices = (struct cuse_device *)calloc(num_ns, sizeof(struct cuse_device)); + /* Start all active namespaces */ + if (cuse_nvme_ctrlr_update_namespaces(ctrlr_device) < 0) { + SPDK_ERRLOG("Cannot start CUSE namespace devices."); + cuse_nvme_ctrlr_stop(ctrlr_device); + rv = -1; + goto err3; + } + + return 0; + +err3: + spdk_bit_array_clear(g_ctrlr_started, ctrlr_device->index); +err2: + free(ctrlr_device); + if (spdk_bit_array_count_set(g_ctrlr_started) == 0) { + spdk_bit_array_free(&g_ctrlr_started); + } + return rv; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ctrlr_device(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device = NULL; + + TAILQ_FOREACH(ctrlr_device, &g_ctrlr_ctx_head, tailq) { + if 
(ctrlr_device->ctrlr == ctrlr) { + break; + } + } + + return ctrlr_device; +} + +static struct cuse_device * +nvme_cuse_get_cuse_ns_device(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid) +{ + struct cuse_device *ctrlr_device = NULL; + uint32_t num_ns = spdk_nvme_ctrlr_get_num_ns(ctrlr); + + if (nsid < 1 || nsid > num_ns) { + return NULL; + } + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + return NULL; + } + + if (!ctrlr_device->ns_devices[nsid - 1].is_started) { + return NULL; + } + + return &ctrlr_device->ns_devices[nsid - 1]; +} + +static void +nvme_cuse_stop(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static void +nvme_cuse_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return; + } + + cuse_nvme_ctrlr_update_namespaces(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); +} + +static struct nvme_io_msg_producer cuse_nvme_io_msg_producer = { + .name = "cuse", + .stop = nvme_cuse_stop, + .update = nvme_cuse_update, +}; + +int +spdk_nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + rc = nvme_io_msg_ctrlr_register(ctrlr, &cuse_nvme_io_msg_producer); + if (rc) { + return rc; + } + + pthread_mutex_lock(&g_cuse_mtx); + + rc = nvme_cuse_start(ctrlr); + if (rc) { + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + } + + pthread_mutex_unlock(&g_cuse_mtx); + + return rc; +} + +int +spdk_nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr) +{ + struct cuse_device *ctrlr_device; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + SPDK_ERRLOG("Cannot find associated CUSE device\n"); + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + cuse_nvme_ctrlr_stop(ctrlr_device); + + pthread_mutex_unlock(&g_cuse_mtx); + + nvme_io_msg_ctrlr_unregister(ctrlr, &cuse_nvme_io_msg_producer); + + return 0; +} + +void +spdk_nvme_cuse_update_namespaces(struct spdk_nvme_ctrlr *ctrlr) +{ + nvme_cuse_update(ctrlr); +} + +int +spdk_nvme_cuse_get_ctrlr_name(struct spdk_nvme_ctrlr *ctrlr, char *name, size_t *size) +{ + struct cuse_device *ctrlr_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ctrlr_device = nvme_cuse_get_cuse_ctrlr_device(ctrlr); + if (!ctrlr_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ctrlr_device->dev_name, sizeof(ctrlr_device->dev_name)); + if (*size < req_len) { + *size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ctrlr_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} + +int +spdk_nvme_cuse_get_ns_name(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, char *name, size_t *size) +{ + struct cuse_device *ns_device; + size_t req_len; + + pthread_mutex_lock(&g_cuse_mtx); + + ns_device = nvme_cuse_get_cuse_ns_device(ctrlr, nsid); + if (!ns_device) { + pthread_mutex_unlock(&g_cuse_mtx); + return -ENODEV; + } + + req_len = strnlen(ns_device->dev_name, sizeof(ns_device->dev_name)); + if (*size < req_len) { + 
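+ /* Caller's buffer is too small: report the required length back
+ * through *size so the call can be retried with a larger buffer. */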
*size = req_len; + pthread_mutex_unlock(&g_cuse_mtx); + return -ENOSPC; + } + snprintf(name, req_len + 1, "%s", ns_device->dev_name); + + pthread_mutex_unlock(&g_cuse_mtx); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_cuse.h b/src/spdk/lib/nvme/nvme_cuse.h new file mode 100644 index 000000000..92b475190 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_cuse.h @@ -0,0 +1,42 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_CUSE_H__ +#define __NVME_CUSE_H__ + +#include "spdk/nvme.h" + +int nvme_cuse_register(struct spdk_nvme_ctrlr *ctrlr, const char *dev_path); +void nvme_cuse_unregister(struct spdk_nvme_ctrlr *ctrlr); + +#endif /* __NVME_CUSE_H__ */ diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c new file mode 100644 index 000000000..9fff20873 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_fabric.c @@ -0,0 +1,475 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics transport-independent functions + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/string.h" + +static int +nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET; + cmd.ofst = offset; + cmd.attrib.size = size; + cmd.value.u64 = value; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Set failed\n"); + return -1; + } + free(status); + + return 0; +} + +static int +nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr, + uint32_t offset, uint8_t size, uint64_t *value) +{ + struct spdk_nvmf_fabric_prop_set_cmd cmd = {}; + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_prop_get_rsp *response; + int rc; + + assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8); + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET; + cmd.ofst = offset; + cmd.attrib.size = size; + + rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd, + NULL, 0, nvme_completion_poll_cb, + status); + if (rc < 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + SPDK_ERRLOG("Property Get failed\n"); + return -1; + } + + response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status->cpl; + + if (size == SPDK_NVMF_PROP_SIZE_4) { + *value = response->value.u32.low; + } else { + *value = response->value.u64; + } + + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value); +} + +int +nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +int +nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr 
*ctrlr, uint32_t offset, uint32_t *value) +{ + uint64_t tmp_value; + int rc; + rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value); + + if (!rc) { + *value = (uint32_t)tmp_value; + } + return rc; +} + +int +nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value); +} + +static void +nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry, + struct spdk_nvme_probe_ctx *probe_ctx, + int discover_priority) +{ + struct spdk_nvme_transport_id trid; + uint8_t *end; + size_t len; + + memset(&trid, 0, sizeof(trid)); + + if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_WARNLOG("Skipping unsupported discovery service referral\n"); + return; + } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) { + SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype); + return; + } + + trid.trtype = entry->trtype; + spdk_nvme_transport_id_populate_trstring(&trid, spdk_nvme_transport_id_trtype_str(entry->trtype)); + if (!spdk_nvme_transport_available_by_name(trid.trstring)) { + SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n", + trid.trtype); + return; + } + + snprintf(trid.trstring, sizeof(trid.trstring), "%s", probe_ctx->trid.trstring); + trid.adrfam = entry->adrfam; + + /* Ensure that subnqn is null terminated. */ + end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n"); + return; + } + len = end - entry->subnqn; + memcpy(trid.subnqn, entry->subnqn, len); + trid.subnqn[len] = '\0'; + + /* Convert traddr to a null terminated string. */ + len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' '); + memcpy(trid.traddr, entry->traddr, len); + if (spdk_str_chomp(trid.traddr) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n"); + } + + /* Convert trsvcid to a null terminated string. 
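+ * Like traddr above, this field is space-padded in the discovery log
+ * entry, so trailing padding and newlines are stripped here.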
*/ + len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' '); + memcpy(trid.trsvcid, entry->trsvcid, len); + if (spdk_str_chomp(trid.trsvcid) != 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n"); + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n", + trid.subnqn, trid.trtype, + trid.traddr, trid.trsvcid); + + /* Copy the priority from the discovery ctrlr */ + trid.priority = discover_priority; + + nvme_ctrlr_probe(&trid, probe_ctx, NULL); +} + +static int +nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr, + void *log_page, uint32_t size, uint64_t offset) +{ + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset, + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return -1; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + + return 0; +} + +int +nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct spdk_nvme_ctrlr_opts discovery_opts; + struct spdk_nvme_ctrlr *discovery_ctrlr; + union spdk_nvme_cc_register cc; + int rc; + struct nvme_completion_poll_status *status; + + if (strcmp(probe_ctx->trid.subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) { + /* It is not a discovery_ctrlr info and try to directly connect it */ + rc = nvme_ctrlr_probe(&probe_ctx->trid, probe_ctx, NULL); + return rc; + } + + spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts)); + /* For discovery_ctrlr set the timeout to 0 */ + discovery_opts.keep_alive_timeout_ms = 0; + + discovery_ctrlr = nvme_transport_ctrlr_construct(&probe_ctx->trid, &discovery_opts, NULL); + if (discovery_ctrlr == NULL) { + return -1; + } + nvme_qpair_set_state(discovery_ctrlr->adminq, NVME_QPAIR_ENABLED); + + /* TODO: this should be using the normal NVMe controller initialization process +1 */ + cc.raw = 0; + cc.bits.en = 1; + cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */ + cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */ + rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw), + cc.raw); + if (rc < 0) { + SPDK_ERRLOG("Failed to set cc\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -1; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + return -ENOMEM; + } + + /* get the cdata info */ + rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0, + &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to identify cdata\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + free(status); + return rc; + } + + if (nvme_wait_for_completion(discovery_ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_identify_controller failed!\n"); + nvme_ctrlr_destruct(discovery_ctrlr); + if (!status->timed_out) { + free(status); + } + return -ENXIO; + } + + free(status); + + /* Direct attach through spdk_nvme_connect() API */ + if (direct_connect == true) { + /* Set the ready state to skip the normal init process */ + discovery_ctrlr->state = NVME_CTRLR_STATE_READY; + nvme_ctrlr_connected(probe_ctx, discovery_ctrlr); + 
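+ /* Keep this discovery controller and register the calling process
+ * with it, instead of walking the discovery log and destructing it. */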
nvme_ctrlr_add_process(discovery_ctrlr, 0); + return 0; + } + + rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, probe_ctx); + nvme_ctrlr_destruct(discovery_ctrlr); + return rc; +} + +int +nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvmf_discovery_log_page *log_page; + struct spdk_nvmf_discovery_log_page_entry *log_page_entry; + char buffer[4096]; + int rc; + uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0; + uint64_t remaining_num_rec = 0; + uint16_t recfmt; + + memset(buffer, 0x0, 4096); + buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page, + entries[0])) / + sizeof(struct spdk_nvmf_discovery_log_page_entry); + buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry); + do { + rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n"); + return rc; + } + + if (!remaining_num_rec) { + log_page = (struct spdk_nvmf_discovery_log_page *)buffer; + recfmt = from_le16(&log_page->recfmt); + if (recfmt != 0) { + SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt); + return -EPROTO; + } + remaining_num_rec = log_page->numrec; + log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]); + log_page_entry = &log_page->entries[0]; + numrec = spdk_min(remaining_num_rec, buffer_max_entries_first); + } else { + numrec = spdk_min(remaining_num_rec, buffer_max_entries); + log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer; + } + + for (i = 0; i < numrec; i++) { + nvme_fabric_discover_probe(log_page_entry++, probe_ctx, ctrlr->trid.priority); + } + remaining_num_rec -= numrec; + log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry); + } while (remaining_num_rec != 0); + + return 0; +} + +int +nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvmf_fabric_connect_rsp *rsp; + struct spdk_nvmf_fabric_connect_cmd cmd; + struct spdk_nvmf_fabric_connect_data *nvmf_data; + struct spdk_nvme_ctrlr *ctrlr; + int rc; + + if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) { + return -EINVAL; + } + + ctrlr = qpair->ctrlr; + if (!ctrlr) { + return -EINVAL; + } + + nvmf_data = spdk_zmalloc(sizeof(*nvmf_data), 0, NULL, + SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!nvmf_data) { + SPDK_ERRLOG("nvmf_data allocation error\n"); + return -ENOMEM; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + spdk_free(nvmf_data); + return -ENOMEM; + } + + memset(&cmd, 0, sizeof(cmd)); + cmd.opcode = SPDK_NVME_OPC_FABRIC; + cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT; + cmd.qid = qpair->id; + cmd.sqsize = num_entries - 1; + cmd.kato = ctrlr->opts.keep_alive_timeout_ms; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvmf_data->cntlid = 0xFFFF; + } else { + nvmf_data->cntlid = ctrlr->cntlid; + } + + SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id), + "host ID size mismatch"); + memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid)); + snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn); + snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn); + + rc = 
spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair, + (struct spdk_nvme_cmd *)&cmd, + nvmf_data, sizeof(*nvmf_data), + nvme_completion_poll_cb, status); + if (rc < 0) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + free(status); + return rc; + } + + if (nvme_wait_for_completion(qpair, status)) { + SPDK_ERRLOG("Connect command failed\n"); + spdk_free(nvmf_data); + if (!status->timed_out) { + free(status); + } + return -EIO; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status->cpl; + ctrlr->cntlid = rsp->status_code_specific.success.cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid); + } + + spdk_free(nvmf_data); + free(status); + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h new file mode 100644 index 000000000..98fec279d --- /dev/null +++ b/src/spdk/lib/nvme/nvme_internal.h @@ -0,0 +1,1233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVME_INTERNAL_H__ +#define __NVME_INTERNAL_H__ + +#include "spdk/config.h" +#include "spdk/likely.h" +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" + +#if defined(__i386__) || defined(__x86_64__) +#include <x86intrin.h> +#endif + +#include "spdk/queue.h" +#include "spdk/barrier.h" +#include "spdk/bit_array.h" +#include "spdk/mmio.h" +#include "spdk/pci_ids.h" +#include "spdk/util.h" +#include "spdk/memory.h" +#include "spdk/nvme_intel.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +extern pid_t g_spdk_nvme_pid; + +/* + * Some Intel devices support vendor-unique read latency log page even + * though the log page directory says otherwise. + */ +#define NVME_INTEL_QUIRK_READ_LATENCY 0x1 + +/* + * Some Intel devices support vendor-unique write latency log page even + * though the log page directory says otherwise. 
+ */
+#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2
+
+/*
+ * The controller needs a delay before it starts checking the device
+ * readiness, which is done by reading the NVME_CSTS_RDY bit.
+ */
+#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4
+
+/*
+ * The controller performs best when I/O is split on particular
+ * LBA boundaries.
+ */
+#define NVME_INTEL_QUIRK_STRIPING 0x8
+
+/*
+ * The controller needs a delay after allocating an I/O queue pair
+ * before it is ready to accept I/O commands.
+ */
+#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10
+
+/*
+ * Earlier NVMe devices do not indicate whether unmapped blocks
+ * will read all zeroes or not. This define indicates that the
+ * device does in fact read all zeroes after an unmap event.
+ */
+#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20
+
+/*
+ * The controller doesn't handle Identify values other than 0 or 1 correctly.
+ */
+#define NVME_QUIRK_IDENTIFY_CNS 0x40
+
+/*
+ * The controller supports the Open Channel command set if an additional
+ * condition matches, e.g. the first byte (value 0x1) in the vendor specific
+ * bits of the namespace identify structure is set.
+ */
+#define NVME_QUIRK_OCSSD 0x80
+
+/*
+ * The controller has an Intel vendor ID but does not support Intel vendor-specific
+ * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor
+ * ID but do not support these log pages.
+ */
+#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100
+
+/*
+ * The controller does not set SHST_COMPLETE in a reasonable amount of time. This
+ * is primarily seen in virtual VMware NVMe SSDs. This quirk merely adds an additional
+ * error message that on VMware NVMe SSDs, the shutdown timeout may be expected.
+ */
+#define NVME_QUIRK_SHST_COMPLETE 0x200
+
+/*
+ * The controller requires an extra delay before starting the initialization process
+ * during attach.
+ */
+#define NVME_QUIRK_DELAY_BEFORE_INIT 0x400
+
+/*
+ * Some SSDs exhibit poor performance with the default SPDK NVMe IO queue size.
+ * This quirk will increase the default to 1024 which matches other operating
+ * systems, at the cost of some extra memory usage. Users can still override
+ * the increased default by changing the spdk_nvme_io_qpair_opts when allocating
+ * a new queue pair.
+ */
+#define NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE 0x800
+
+/**
+ * The maximum access width to PCI memory space is 8 bytes; don't use AVX2 or
+ * SSE instructions to optimize memory accesses (memcpy or memset) larger than
+ * 8 bytes.
+ */
+#define NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH 0x1000
+
+/**
+ * The SSD does not support OPAL even though it sets the security bit in OACS.
+ */
+#define NVME_QUIRK_OACS_SECURITY 0x2000
+
+#define NVME_MAX_ASYNC_EVENTS (8)
+
+#define NVME_MAX_ADMIN_TIMEOUT_IN_SECS (30)
+
+/* Maximum log page size to fetch for AERs. */
+#define NVME_MAX_AER_LOG_SIZE (4096)
+
+/*
+ * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this
+ * define specifies the maximum number of queues this driver will actually
+ * try to configure, if available.
+ */ +#define DEFAULT_MAX_IO_QUEUES (1024) +#define DEFAULT_ADMIN_QUEUE_SIZE (32) +#define DEFAULT_IO_QUEUE_SIZE (256) +#define DEFAULT_IO_QUEUE_SIZE_FOR_QUIRK (1024) /* Matches Linux kernel driver */ + +#define DEFAULT_IO_QUEUE_REQUESTS (512) + +#define SPDK_NVME_DEFAULT_RETRY_COUNT (4) + +#define SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED (0) +#define SPDK_NVME_DEFAULT_TRANSPORT_ACK_TIMEOUT SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED + +#define MIN_KEEP_ALIVE_TIMEOUT_IN_MS (10000) + +/* We want to fit submission and completion rings each in a single 2MB + * hugepage to ensure physical address contiguity. + */ +#define MAX_IO_QUEUE_ENTRIES (VALUE_2MB / spdk_max( \ + sizeof(struct spdk_nvme_cmd), \ + sizeof(struct spdk_nvme_cpl))) + +enum nvme_payload_type { + NVME_PAYLOAD_TYPE_INVALID = 0, + + /** nvme_request::u.payload.contig_buffer is valid for this request */ + NVME_PAYLOAD_TYPE_CONTIG, + + /** nvme_request::u.sgl is valid for this request */ + NVME_PAYLOAD_TYPE_SGL, +}; + +/** + * Descriptor for a request data payload. + */ +struct nvme_payload { + /** + * Functions for retrieving physical addresses for scattered payloads. + */ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn; + + /** + * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the + * virtual memory address of a single virtually contiguous buffer. + * + * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the + * cb_arg that will be passed to the SGL callback functions. + */ + void *contig_or_cb_arg; + + /** Virtual memory address of a single virtually contiguous metadata buffer */ + void *md; +}; + +#define NVME_PAYLOAD_CONTIG(contig_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = NULL, \ + .next_sge_fn = NULL, \ + .contig_or_cb_arg = (contig_), \ + .md = (md_), \ + } + +#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \ + (struct nvme_payload) { \ + .reset_sgl_fn = (reset_sgl_fn_), \ + .next_sge_fn = (next_sge_fn_), \ + .contig_or_cb_arg = (cb_arg_), \ + .md = (md_), \ + } + +static inline enum nvme_payload_type +nvme_payload_type(const struct nvme_payload *payload) { + return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG; +} + +struct nvme_error_cmd { + bool do_not_submit; + uint64_t timeout_tsc; + uint32_t err_count; + uint8_t opc; + struct spdk_nvme_status status; + TAILQ_ENTRY(nvme_error_cmd) link; +}; + +struct nvme_request { + struct spdk_nvme_cmd cmd; + + uint8_t retries; + + uint8_t timed_out : 1; + + /** + * True if the request is in the queued_req list. + */ + uint8_t queued : 1; + uint8_t reserved : 6; + + /** + * Number of children requests still outstanding for this + * request which was split into multiple child requests. + */ + uint16_t num_children; + + /** + * Offset in bytes from the beginning of payload for this request. + * This is used for I/O commands that are split into multiple requests. + */ + uint32_t payload_offset; + uint32_t md_offset; + + uint32_t payload_size; + + /** + * Timeout ticks for error injection requests, can be extended in future + * to support per-request timeout feature. + */ + uint64_t timeout_tsc; + + /** + * Data payload for this request's command. + */ + struct nvme_payload payload; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + STAILQ_ENTRY(nvme_request) stailq; + + struct spdk_nvme_qpair *qpair; + + /* + * The value of spdk_get_ticks() when the request was submitted to the hardware. + * Only set if ctrlr->timeout_enabled is true. 
+ */ + uint64_t submit_tick; + + /** + * The active admin request can be moved to a per process pending + * list based on the saved pid to tell which process it belongs + * to. The cpl saves the original completion information which + * is used in the completion callback. + * NOTE: these below two fields are only used for admin request. + */ + pid_t pid; + struct spdk_nvme_cpl cpl; + + uint32_t md_size; + + /** + * The following members should not be reordered with members + * above. These members are only needed when splitting + * requests which is done rarely, and the driver is careful + * to not touch the following fields until a split operation is + * needed, to avoid touching an extra cacheline. + */ + + /** + * Points to the outstanding child requests for a parent request. + * Only valid if a request was split into multiple children + * requests, and is not initialized for non-split requests. + */ + TAILQ_HEAD(, nvme_request) children; + + /** + * Linked-list pointers for a child request in its parent's list. + */ + TAILQ_ENTRY(nvme_request) child_tailq; + + /** + * Points to a parent request if part of a split request, + * NULL otherwise. + */ + struct nvme_request *parent; + + /** + * Completion status for a parent request. Initialized to all 0's + * (SUCCESS) before child requests are submitted. If a child + * request completes with error, the error status is copied here, + * to ensure that the parent request is also completed with error + * status once all child requests are completed. + */ + struct spdk_nvme_cpl parent_status; + + /** + * The user_cb_fn and user_cb_arg fields are used for holding the original + * callback data when using nvme_allocate_request_user_copy. + */ + spdk_nvme_cmd_cb user_cb_fn; + void *user_cb_arg; + void *user_buffer; +}; + +struct nvme_completion_poll_status { + struct spdk_nvme_cpl cpl; + bool done; + /* This flag indicates that the request has been timed out and the memory + must be freed in a completion callback */ + bool timed_out; +}; + +struct nvme_async_event_request { + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_request *req; + struct spdk_nvme_cpl cpl; +}; + +enum nvme_qpair_state { + NVME_QPAIR_DISCONNECTED, + NVME_QPAIR_DISCONNECTING, + NVME_QPAIR_CONNECTING, + NVME_QPAIR_CONNECTED, + NVME_QPAIR_ENABLING, + NVME_QPAIR_ENABLED, + NVME_QPAIR_DESTROYING, +}; + +struct spdk_nvme_qpair { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t id; + + uint8_t qprio; + + uint8_t state : 3; + + /* + * Members for handling IO qpair deletion inside of a completion context. + * These are specifically defined as single bits, so that they do not + * push this data structure out to another cacheline. + */ + uint8_t in_completion_context : 1; + uint8_t delete_after_completion_context: 1; + + /* + * Set when no deletion notification is needed. For example, the process + * which allocated this qpair exited unexpectedly. 
+ */ + uint8_t no_deletion_notification_needed: 1; + + uint8_t first_fused_submitted: 1; + + enum spdk_nvme_transport_type trtype; + + STAILQ_HEAD(, nvme_request) free_req; + STAILQ_HEAD(, nvme_request) queued_req; + STAILQ_HEAD(, nvme_request) aborting_queued_req; + + /* List entry for spdk_nvme_transport_poll_group::qpairs */ + STAILQ_ENTRY(spdk_nvme_qpair) poll_group_stailq; + + /** Commands opcode in this list will return error */ + TAILQ_HEAD(, nvme_error_cmd) err_cmd_head; + /** Requests in this list will return error */ + STAILQ_HEAD(, nvme_request) err_req_head; + + /* List entry for spdk_nvme_ctrlr::active_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) tailq; + + /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */ + TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq; + + struct spdk_nvme_ctrlr_process *active_proc; + + struct spdk_nvme_transport_poll_group *poll_group; + + void *poll_group_tailq_head; + + void *req_buf; + + const struct spdk_nvme_transport *transport; + + uint8_t transport_failure_reason: 2; +}; + +struct spdk_nvme_poll_group { + void *ctx; + STAILQ_HEAD(, spdk_nvme_transport_poll_group) tgroups; +}; + +struct spdk_nvme_transport_poll_group { + struct spdk_nvme_poll_group *group; + const struct spdk_nvme_transport *transport; + STAILQ_HEAD(, spdk_nvme_qpair) connected_qpairs; + STAILQ_HEAD(, spdk_nvme_qpair) disconnected_qpairs; + STAILQ_ENTRY(spdk_nvme_transport_poll_group) link; + bool in_completion_context; + uint64_t num_qpairs_to_delete; +}; + +struct spdk_nvme_ns { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t sector_size; + + /* + * Size of data transferred as part of each block, + * including metadata if FLBAS indicates the metadata is transferred + * as part of the data buffer at the end of each LBA. + */ + uint32_t extended_lba_size; + + uint32_t md_size; + uint32_t pi_type; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + uint32_t id; + uint16_t flags; + + /* Namespace Identification Descriptor List (CNS = 03h) */ + uint8_t id_desc_list[4096]; +}; + +/** + * State of struct spdk_nvme_ctrlr (in particular, during initialization). + */ +enum nvme_ctrlr_state { + /** + * Wait before initializing the controller. + */ + NVME_CTRLR_STATE_INIT_DELAY, + + /** + * Controller has not been initialized yet. + */ + NVME_CTRLR_STATE_INIT, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, + + /** + * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1. + */ + NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, + + /** + * Enable the controller by writing CC.EN to 1 + */ + NVME_CTRLR_STATE_ENABLE, + + /** + * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller. + */ + NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, + + /** + * Reset the Admin queue of the controller. + */ + NVME_CTRLR_STATE_RESET_ADMIN_QUEUE, + + /** + * Identify Controller command will be sent to then controller. + */ + NVME_CTRLR_STATE_IDENTIFY, + + /** + * Waiting for Identify Controller command be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, + + /** + * Set Number of Queues of the controller. + */ + NVME_CTRLR_STATE_SET_NUM_QUEUES, + + /** + * Waiting for Set Num of Queues command to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, + + /** + * Construct Namespace data structures of the controller. + */ + NVME_CTRLR_STATE_CONSTRUCT_NS, + + /** + * Get active Namespace list of the controller. 
+ */ + NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, + + /** + * Waiting for the Identify Active Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ACTIVE_NS, + + /** + * Get Identify Namespace Data structure for each NS. + */ + NVME_CTRLR_STATE_IDENTIFY_NS, + + /** + * Waiting for the Identify Namespace commands to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, + + /** + * Get Identify Namespace Identification Descriptors. + */ + NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, + + /** + * Waiting for the Identify Namespace Identification + * Descriptors to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, + + /** + * Configure AER of the controller. + */ + NVME_CTRLR_STATE_CONFIGURE_AER, + + /** + * Waiting for the Configure AER to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, + + /** + * Set supported log pages of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, + + /** + * Set supported features of the controller. + */ + NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, + + /** + * Set Doorbell Buffer Config of the controller. + */ + NVME_CTRLR_STATE_SET_DB_BUF_CFG, + + /** + * Waiting for Doorbell Buffer Config to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, + + /** + * Set Keep Alive Timeout of the controller. + */ + NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, + + /** + * Waiting for Set Keep Alive Timeout to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, + + /** + * Set Host ID of the controller. + */ + NVME_CTRLR_STATE_SET_HOST_ID, + + /** + * Waiting for Set Host ID to be completed. + */ + NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, + + /** + * Controller initialization has completed and the controller is ready. + */ + NVME_CTRLR_STATE_READY, + + /** + * Controller inilialization has an error. + */ + NVME_CTRLR_STATE_ERROR +}; + +#define NVME_TIMEOUT_INFINITE 0 + +/* + * Used to track properties for all processes accessing the controller. + */ +struct spdk_nvme_ctrlr_process { + /** Whether it is the primary process */ + bool is_primary; + + /** Process ID */ + pid_t pid; + + /** Active admin requests to be completed */ + STAILQ_HEAD(, nvme_request) active_reqs; + + TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq; + + /** Per process PCI device handle */ + struct spdk_pci_device *devhandle; + + /** Reference to track the number of attachment to this controller. */ + int ref; + + /** Allocated IO qpairs */ + TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs; + + spdk_nvme_aer_cb aer_cb_fn; + void *aer_cb_arg; + + /** + * A function pointer to timeout callback function + */ + spdk_nvme_timeout_cb timeout_cb_fn; + void *timeout_cb_arg; + uint64_t timeout_ticks; +}; + +/* + * One of these per allocated PCI device. + */ +struct spdk_nvme_ctrlr { + /* Hot data (accessed in I/O path) starts here. */ + + /** Array of namespaces indexed by nsid - 1 */ + struct spdk_nvme_ns *ns; + + uint32_t num_ns; + + bool is_removed; + + bool is_resetting; + + bool is_failed; + + bool is_destructed; + + bool timeout_enabled; + + uint16_t max_sges; + + uint16_t cntlid; + + /** Controller support flags */ + uint64_t flags; + + /** NVMEoF in-capsule data size in bytes */ + uint32_t ioccsz_bytes; + + /** NVMEoF in-capsule data offset in 16 byte units */ + uint16_t icdoff; + + /* Cold data (not accessed in normal I/O path) is after this point. 
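 *
 * For fabrics controllers, the two in-capsule data fields above are derived
 * from Identify Controller at init time; IOCCSZ is reported in 16-byte units,
 * so only the byte value is kept hot. A rough sketch of the conversion (struct
 * field names assumed from the NVMe-oF Identify Controller layout):
 *
 *     ctrlr->ioccsz_bytes = ctrlr->cdata.nvmf_specific.ioccsz * 16;
 *     ctrlr->icdoff = ctrlr->cdata.nvmf_specific.icdoff;
 *     // e.g. ioccsz = 260 -> 4160 bytes: a 64-byte command capsule
 *     // plus up to 4096 bytes of in-capsule data.
 *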
*/ + + struct spdk_nvme_transport_id trid; + + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + + enum nvme_ctrlr_state state; + uint64_t state_timeout_tsc; + + uint64_t next_keep_alive_tick; + uint64_t keep_alive_interval_ticks; + + TAILQ_ENTRY(spdk_nvme_ctrlr) tailq; + + /** All the log pages supported */ + bool log_page_supported[256]; + + /** All the features supported */ + bool feature_supported[256]; + + /** maximum i/o size in bytes */ + uint32_t max_xfer_size; + + /** minimum page size supported by this controller in bytes */ + uint32_t min_page_size; + + /** selected memory page size for this controller in bytes */ + uint32_t page_size; + + uint32_t num_aers; + struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; + + /** guards access to the controller itself, including admin queues */ + pthread_mutex_t ctrlr_lock; + + struct spdk_nvme_qpair *adminq; + + /** shadow doorbell buffer */ + uint32_t *shadow_doorbell; + /** eventidx buffer */ + uint32_t *eventidx; + + /** + * Identify Controller data. + */ + struct spdk_nvme_ctrlr_data cdata; + + /** + * Keep track of active namespaces + */ + uint32_t *active_ns_list; + + /** + * Array of Identify Namespace data. + * + * Stored separately from ns since nsdata should not normally be accessed during I/O. + */ + struct spdk_nvme_ns_data *nsdata; + + struct spdk_bit_array *free_io_qids; + TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs; + + struct spdk_nvme_ctrlr_opts opts; + + uint64_t quirks; + + /* Extra sleep time during controller initialization */ + uint64_t sleep_timeout_tsc; + + /** Track all the processes manage this controller */ + TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs; + + + STAILQ_HEAD(, nvme_request) queued_aborts; + uint32_t outstanding_aborts; + + /* CB to notify the user when the ctrlr is removed/failed. */ + spdk_nvme_remove_cb remove_cb; + void *cb_ctx; + + struct spdk_nvme_qpair *external_io_msgs_qpair; + pthread_mutex_t external_io_msgs_lock; + struct spdk_ring *external_io_msgs; + + STAILQ_HEAD(, nvme_io_msg_producer) io_producers; +}; + +struct spdk_nvme_probe_ctx { + struct spdk_nvme_transport_id trid; + void *cb_ctx; + spdk_nvme_probe_cb probe_cb; + spdk_nvme_attach_cb attach_cb; + spdk_nvme_remove_cb remove_cb; + TAILQ_HEAD(, spdk_nvme_ctrlr) init_ctrlrs; +}; + +struct nvme_driver { + pthread_mutex_t lock; + + /** Multi-process shared attached controller list */ + TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs; + + bool initialized; + struct spdk_uuid default_extended_host_id; + + /** netlink socket fd for hotplug messages */ + int hotplug_fd; +}; + +extern struct nvme_driver *g_spdk_nvme_driver; + +int nvme_driver_init(void); + +#define nvme_delay usleep + +static inline bool +nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id == 0; +} + +static inline bool +nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair) +{ + return qpair->id != 0; +} + +static inline int +nvme_robust_mutex_lock(pthread_mutex_t *mtx) +{ + int rc = pthread_mutex_lock(mtx); + +#ifndef __FreeBSD__ + if (rc == EOWNERDEAD) { + rc = pthread_mutex_consistent(mtx); + } +#endif + + return rc; +} + +static inline int +nvme_robust_mutex_unlock(pthread_mutex_t *mtx) +{ + return pthread_mutex_unlock(mtx); +} + +/* Poll group management functions. 
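 *
 * The helpers below back the public poll-group API, which lets one thread poll
 * many qpairs from several controllers in a single call. A rough usage sketch,
 * assuming the spdk_nvme_poll_group_* functions exported by this SPDK version
 * and a qpair created in a state that allows it to be added to a group:
 *
 *     struct spdk_nvme_poll_group *pg = spdk_nvme_poll_group_create(NULL);
 *     spdk_nvme_poll_group_add(pg, qpair);
 *     for (;;) {
 *             // disconnected_cb is an application-supplied callback
 *             spdk_nvme_poll_group_process_completions(pg, 0, disconnected_cb);
 *     }
 *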
*/ +int nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); + +/* Admin functions */ +int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, + uint8_t cns, uint16_t cntid, uint32_t nsid, + void *payload, size_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr, + uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr, + union spdk_nvme_feat_async_event_configuration config, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, + uint64_t prp1, uint64_t prp2, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn, + void *cb_arg); +int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr, + const struct spdk_nvme_fw_commit *fw_commit, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr, + uint32_t size, uint32_t offset, void *payload, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +int nvme_ctrlr_cmd_sanitize(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + struct spdk_nvme_sanitize *sanitize, uint32_t cdw11, + spdk_nvme_cmd_cb cb_fn, void *cb_arg); +void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl); +int nvme_wait_for_completion(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status); +int nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + pthread_mutex_t *robust_mutex); +int nvme_wait_for_completion_timeout(struct spdk_nvme_qpair *qpair, + struct nvme_completion_poll_status *status, + uint64_t timeout_in_secs); + +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, + pid_t pid); +struct spdk_nvme_ctrlr_process *nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle); +void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, + struct spdk_nvme_probe_ctx *probe_ctx, void *devhandle); + +int nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove); +int nvme_ctrlr_reset(struct 
spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_connected(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_nvme_ctrlr *ctrlr); + +int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_request *req); +int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap); +int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs); +int nvme_ctrlr_get_cmbsz(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cmbsz_register *cmbsz); +void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap, + const union spdk_nvme_vs_register *vs); +void nvme_ctrlr_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests); +void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair); +void nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair); +int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req); +void nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +uint32_t nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg); +void nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests); + +int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns); +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr); +void nvme_ns_destruct(struct spdk_nvme_ns *ns); +int nvme_ns_update(struct spdk_nvme_ns *ns); + +int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_fabric_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_probe_ctx *probe_ctx); +int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries); + +static inline struct nvme_request * +nvme_allocate_request(struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_size, uint32_t md_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + + req = STAILQ_FIRST(&qpair->free_req); + if (req == NULL) { + return req; + } + + STAILQ_REMOVE_HEAD(&qpair->free_req, stailq); + + /* + * Only memset/zero fields that need it. All other fields + * will be initialized appropriately either later in this + * function, or before they are needed later in the + * submission patch. For example, the children + * TAILQ_ENTRY and following members are + * only used as part of I/O splitting so we avoid + * memsetting them until it is actually needed. + * They will be initialized in nvme_request_add_child() + * if the request is split. 
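 *
 * Note that the partial memset below only clears the bytes up to
 * offsetof(struct nvme_request, payload_size), so it relies on the
 * split-related members being declared after that point. A hypothetical guard
 * (not in SPDK) that would catch an accidental reordering:
 *
 *     static_assert(offsetof(struct nvme_request, payload_size) <
 *                   offsetof(struct nvme_request, children),
 *                   "split-related fields must stay outside the zeroed region");
 *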
+ */ + memset(req, 0, offsetof(struct nvme_request, payload_size)); + + req->cb_fn = cb_fn; + req->cb_arg = cb_arg; + req->payload = *payload; + req->payload_size = payload_size; + req->md_size = md_size; + req->pid = g_spdk_nvme_pid; + req->submit_tick = 0; + + return req; +} + +static inline struct nvme_request * +nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_payload payload; + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + return nvme_allocate_request(qpair, &payload, payload_size, 0, cb_fn, cb_arg); +} + +static inline struct nvme_request * +nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg); +} + +struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair, + void *buffer, uint32_t payload_size, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller); + +static inline void +nvme_complete_request(spdk_nvme_cmd_cb cb_fn, void *cb_arg, struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_cpl err_cpl; + struct nvme_error_cmd *cmd; + + /* error injection at completion path, + * only inject for successful completed commands + */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) && + !spdk_nvme_cpl_is_error(cpl))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + + if (cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + + err_cpl = *cpl; + err_cpl.status.sct = cmd->status.sct; + err_cpl.status.sc = cmd->status.sc; + + cpl = &err_cpl; + cmd->err_count--; + break; + } + } + } + + if (cb_fn) { + cb_fn(cb_arg, cpl); + } +} + +static inline void +nvme_free_request(struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + assert(req->qpair != NULL); + + STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq); +} + +static inline void +nvme_qpair_set_state(struct spdk_nvme_qpair *qpair, enum nvme_qpair_state state) +{ + qpair->state = state; +} + +static inline enum nvme_qpair_state +nvme_qpair_get_state(struct spdk_nvme_qpair *qpair) { + return qpair->state; +} + +static inline void +nvme_qpair_free_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + assert(req != NULL); + assert(req->num_children == 0); + + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); +} + +static inline void +nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent != NULL); + assert(child != NULL); + assert(child->parent == parent); + assert(parent->num_children != 0); + + parent->num_children--; + child->parent = NULL; + TAILQ_REMOVE(&parent->children, child, child_tailq); +} + +static inline void +nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl) +{ + struct nvme_request *child = child_arg; + struct nvme_request *parent = child->parent; + + nvme_request_remove_child(parent, child); + + if (spdk_nvme_cpl_is_error(cpl)) { + memcpy(&parent->parent_status, cpl, sizeof(*cpl)); + } + + if (parent->num_children == 0) { + nvme_complete_request(parent->cb_fn, parent->cb_arg, parent->qpair, + parent, &parent->parent_status); + nvme_free_request(parent); + } +} + +static inline void +nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child) +{ + assert(parent->num_children != UINT16_MAX); + + if (parent->num_children == 0) { + /* + 
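 * The error-injection hook in nvme_complete_request() above is normally driven
 * through the public API. An illustrative call, assuming the
 * spdk_nvme_qpair_add_cmd_error_injection() helper and its parameter order in
 * this SPDK version (fail the next two READ completions, without suppressing
 * submission):
 *
 *     spdk_nvme_qpair_add_cmd_error_injection(ctrlr, qpair, SPDK_NVME_OPC_READ,
 *                                             false, 0, 2,
 *                                             SPDK_NVME_SCT_GENERIC,
 *                                             SPDK_NVME_SC_ABORTED_BY_REQUEST);
 *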
* Defer initialization of the children TAILQ since it falls + * on a separate cacheline. This ensures we do not touch this + * cacheline except on request splitting cases, which are + * relatively rare. + */ + TAILQ_INIT(&parent->children); + parent->parent = NULL; + memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl)); + } + + parent->num_children++; + TAILQ_INSERT_TAIL(&parent->children, child, child_tailq); + child->parent = parent; + child->cb_fn = nvme_cb_complete_child; + child->cb_arg = child; +} + +static inline void +nvme_request_free_children(struct nvme_request *req) +{ + struct nvme_request *child, *tmp; + + if (req->num_children == 0) { + return; + } + + /* free all child nvme_request */ + TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) { + nvme_request_remove_child(req, child); + nvme_request_free_children(child); + nvme_free_request(child); + } +} + +int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid, + struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick); +uint64_t nvme_get_quirks(const struct spdk_pci_id *id); + +int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx); +int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx); + +bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl); + +struct spdk_nvme_ctrlr *nvme_get_ctrlr_by_trid_unsafe( + const struct spdk_nvme_transport_id *trid); + +const struct spdk_nvme_transport *nvme_get_transport(const char *transport_name); +const struct spdk_nvme_transport *nvme_get_first_transport(void); +const struct spdk_nvme_transport *nvme_get_next_transport(const struct spdk_nvme_transport + *transport); + +/* Transport specific functions */ +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle); +int nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, bool direct_connect); +int nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); +int nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); +int nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); +int nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); +uint32_t nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr); +uint16_t nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr); +struct spdk_nvme_qpair *nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts); +int nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr); +void *nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size); +int nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr); +int nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +int nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); +void nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); +int nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_submit_request(struct spdk_nvme_qpair 
*qpair, struct nvme_request *req); +int32_t nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions); +void nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair); +int nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg); + +struct spdk_nvme_transport_poll_group *nvme_transport_poll_group_create( + const struct spdk_nvme_transport *transport); +int nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair); +int nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair); +int64_t nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb); +int nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup); +/* + * Below ref related functions must be called with the global + * driver lock held for the multi-process condition. + * Within these functions, the per ctrlr ctrlr_lock is also + * acquired for the multi-thread condition. + */ +void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr); +void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr); +int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr); + +static inline bool +_is_page_aligned(uint64_t address, uint64_t page_size) +{ + return (address & (page_size - 1)) == 0; +} + +#endif /* __NVME_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvme/nvme_io_msg.c b/src/spdk/lib/nvme/nvme_io_msg.c new file mode 100644 index 000000000..fb5aec3d4 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.c @@ -0,0 +1,216 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "nvme_internal.h" +#include "nvme_io_msg.h" + +#define SPDK_NVME_MSG_IO_PROCESS_SIZE 8 + +/** + * Send message to IO queue. + */ +int +nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg) +{ + int rc; + struct spdk_nvme_io_msg *io; + + /* Protect requests ring against preemptive producers */ + pthread_mutex_lock(&ctrlr->external_io_msgs_lock); + + io = (struct spdk_nvme_io_msg *)calloc(1, sizeof(struct spdk_nvme_io_msg)); + if (!io) { + SPDK_ERRLOG("IO msg allocation failed."); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + io->ctrlr = ctrlr; + io->nsid = nsid; + io->fn = fn; + io->arg = arg; + + rc = spdk_ring_enqueue(ctrlr->external_io_msgs, (void **)&io, 1, NULL); + if (rc != 1) { + assert(false); + free(io); + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + return -ENOMEM; + } + + pthread_mutex_unlock(&ctrlr->external_io_msgs_lock); + + return 0; +} + +int +nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr) +{ + int i; + int count; + struct spdk_nvme_io_msg *io; + void *requests[SPDK_NVME_MSG_IO_PROCESS_SIZE]; + + if (!ctrlr->external_io_msgs || !ctrlr->external_io_msgs_qpair) { + /* Not ready or pending reset */ + return 0; + } + + spdk_nvme_qpair_process_completions(ctrlr->external_io_msgs_qpair, 0); + + count = spdk_ring_dequeue(ctrlr->external_io_msgs, requests, + SPDK_NVME_MSG_IO_PROCESS_SIZE); + if (count == 0) { + return 0; + } + + for (i = 0; i < count; i++) { + io = requests[i]; + + assert(io != NULL); + + io->fn(io->ctrlr, io->nsid, io->arg); + free(io); + } + + return count; +} + +static bool +nvme_io_msg_is_producer_registered(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + struct nvme_io_msg_producer *tmp; + + STAILQ_FOREACH(tmp, &ctrlr->io_producers, link) { + if (tmp == io_msg_producer) { + return true; + } + } + return false; +} + +int +nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + if (io_msg_producer == NULL) { + SPDK_ERRLOG("io_msg_producer cannot be NULL\n"); + return -EINVAL; + } + + if (nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return -EEXIST; + } + + if (!STAILQ_EMPTY(&ctrlr->io_producers) || ctrlr->is_resetting) { + /* There are registered producers - IO messaging already started */ + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + return 0; + } + + pthread_mutex_init(&ctrlr->external_io_msgs_lock, NULL); + + /** + * Initialize ring and qpair for controller + */ + ctrlr->external_io_msgs = spdk_ring_create(SPDK_RING_TYPE_MP_SC, 65536, SPDK_ENV_SOCKET_ID_ANY); + if (!ctrlr->external_io_msgs) { + SPDK_ERRLOG("Unable to allocate memory for message ring\n"); + return -ENOMEM; + } + + ctrlr->external_io_msgs_qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, NULL, 0); + if (ctrlr->external_io_msgs_qpair == NULL) { + SPDK_ERRLOG("spdk_nvme_ctrlr_alloc_io_qpair() failed\n"); + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + return -ENOMEM; + } + + STAILQ_INSERT_TAIL(&ctrlr->io_producers, io_msg_producer, link); + + return 0; +} + +void +nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer; + + /* Update all producers */ + STAILQ_FOREACH(io_msg_producer, &ctrlr->io_producers, link) { + io_msg_producer->update(ctrlr); + } +} + +void +nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_io_msg_producer *io_msg_producer, *tmp; + + /* 
Stop all producers */ + STAILQ_FOREACH_SAFE(io_msg_producer, &ctrlr->io_producers, link, tmp) { + io_msg_producer->stop(ctrlr); + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + } + + if (ctrlr->external_io_msgs) { + spdk_ring_free(ctrlr->external_io_msgs); + ctrlr->external_io_msgs = NULL; + } + + if (ctrlr->external_io_msgs_qpair) { + spdk_nvme_ctrlr_free_io_qpair(ctrlr->external_io_msgs_qpair); + ctrlr->external_io_msgs_qpair = NULL; + } + + pthread_mutex_destroy(&ctrlr->external_io_msgs_lock); +} + +void +nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer) +{ + assert(io_msg_producer != NULL); + + if (!nvme_io_msg_is_producer_registered(ctrlr, io_msg_producer)) { + return; + } + + STAILQ_REMOVE(&ctrlr->io_producers, io_msg_producer, nvme_io_msg_producer, link); + if (STAILQ_EMPTY(&ctrlr->io_producers)) { + nvme_io_msg_ctrlr_detach(ctrlr); + } +} diff --git a/src/spdk/lib/nvme/nvme_io_msg.h b/src/spdk/lib/nvme/nvme_io_msg.h new file mode 100644 index 000000000..9c18261d5 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_io_msg.h @@ -0,0 +1,90 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * SPDK cuse + */ + + +#ifndef SPDK_NVME_IO_MSG_H_ +#define SPDK_NVME_IO_MSG_H_ + +typedef void (*spdk_nvme_io_msg_fn)(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, + void *arg); + +struct spdk_nvme_io_msg { + struct spdk_nvme_ctrlr *ctrlr; + uint32_t nsid; + + spdk_nvme_io_msg_fn fn; + void *arg; +}; + +struct nvme_io_msg_producer { + const char *name; + void (*update)(struct spdk_nvme_ctrlr *ctrlr); + void (*stop)(struct spdk_nvme_ctrlr *ctrlr); + STAILQ_ENTRY(nvme_io_msg_producer) link; +}; + +int nvme_io_msg_send(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_io_msg_fn fn, + void *arg); + +/** + * Process IO message sent to controller from external module. 
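 *
 * A minimal illustration of the send/process pairing, with a hypothetical
 * callback: an external module queues work with nvme_io_msg_send(), and the
 * thread that owns the controller later runs it via nvme_io_msg_process().
 *
 *     static void
 *     example_msg(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, void *arg)
 *     {
 *             // executes on the controller's polling thread
 *     }
 *
 *     nvme_io_msg_send(ctrlr, 1, example_msg, NULL);   // any thread
 *     nvme_io_msg_process(ctrlr);                      // owning thread, polled
 *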
+ * + * This call process requests from the ring, send IO to an allocated qpair or + * admin commands in its context. This call is non-blocking and intended to be + * polled by SPDK thread to provide safe environment for NVMe request + * completition sent by external module to controller. + * + * The caller must ensure that each controller is polled by only one thread at + * a time. + * + * This function may be called at any point while the controller is attached to + * the SPDK NVMe driver. + * + * \param ctrlr Opaque handle to NVMe controller. + * + * \return number of processed external IO messages. + */ +int nvme_io_msg_process(struct spdk_nvme_ctrlr *ctrlr); + +int nvme_io_msg_ctrlr_register(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer); +void nvme_io_msg_ctrlr_unregister(struct spdk_nvme_ctrlr *ctrlr, + struct nvme_io_msg_producer *io_msg_producer); +void nvme_io_msg_ctrlr_detach(struct spdk_nvme_ctrlr *ctrlr); +void nvme_io_msg_ctrlr_update(struct spdk_nvme_ctrlr *ctrlr); + +#endif /* SPDK_NVME_IO_MSG_H_ */ diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c new file mode 100644 index 000000000..5d424e5c7 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns.c @@ -0,0 +1,401 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +static inline struct spdk_nvme_ns_data * +_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return &ns->ctrlr->nsdata[ns->id - 1]; +} + +/** + * Update Namespace flags based on Identify Controller + * and Identify Namespace. This can be also used for + * Namespace Attribute Notice events and Namespace + * operations such as Attach/Detach. 
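 *
 * A worked example of the sizing computed below: for an LBA format with
 * 512-byte data blocks, 8 bytes of metadata and FLBAS indicating extended LBAs,
 *
 *     sector_size        = 1 << lbads      = 512
 *     extended_lba_size  = 512 + 8         = 520
 *     sectors_per_max_io = max_xfer / 520    (e.g. 131072 / 520 = 252)
 *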
+ */ +void +nvme_ns_set_identify_data(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + nsdata = _nvme_ns_get_data(ns); + + ns->flags = 0x0000; + + ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads; + ns->extended_lba_size = ns->sector_size; + + ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms; + if (nsdata->flbas.extended) { + ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED; + ns->extended_lba_size += ns->md_size; + } + + ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size; + + if (nsdata->noiob) { + ns->sectors_per_stripe = nsdata->noiob; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING && + ns->ctrlr->cdata.vs[3] != 0) { + ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size / + ns->sector_size; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n", + ns->id, ns->sectors_per_stripe); + } else { + ns->sectors_per_stripe = 0; + } + + if (ns->ctrlr->cdata.oncs.dsm) { + ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.compare) { + ns->flags |= SPDK_NVME_NS_COMPARE_SUPPORTED; + } + + if (ns->ctrlr->cdata.vwc.present) { + ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_zeroes) { + ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED; + } + + if (ns->ctrlr->cdata.oncs.write_unc) { + ns->flags |= SPDK_NVME_NS_WRITE_UNCORRECTABLE_SUPPORTED; + } + + if (nsdata->nsrescap.raw) { + ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED; + } + + ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE; + if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) { + ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED; + ns->pi_type = nsdata->dps.pit; + } +} + +static int +nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + struct spdk_nvme_ns_data *nsdata; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + nsdata = _nvme_ns_get_data(ns); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id, + nsdata, sizeof(*nsdata), + nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, + &ns->ctrlr->ctrlr_lock)) { + if (!status->timed_out) { + free(status); + } + /* This can occur if the namespace is not active. Simply zero the + * namespace data and continue. 
*/ + nvme_ns_destruct(ns); + return 0; + } + free(status); + + nvme_ns_set_identify_data(ns); + + return 0; +} + +static int +nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns) +{ + struct nvme_completion_poll_status *status; + int rc; + + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + + if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) || + (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n"); + return 0; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n"); + rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id, + ns->id_desc_list, sizeof(ns->id_desc_list), + nvme_completion_poll_cb, status); + if (rc < 0) { + free(status); + return rc; + } + + rc = nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, status, &ns->ctrlr->ctrlr_lock); + if (rc != 0) { + SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n"); + memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list)); + } + + if (!status->timed_out) { + free(status); + } + + return rc; +} + +uint32_t +spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns) +{ + return ns->id; +} + +bool +spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns) +{ + const struct spdk_nvme_ns_data *nsdata = NULL; + + /* + * According to the spec, valid NS has non-zero id. + */ + if (ns->id == 0) { + return false; + } + + nsdata = _nvme_ns_get_data(ns); + + /* + * According to the spec, Identify Namespace will return a zero-filled structure for + * inactive namespace IDs. + * Check NCAP since it must be nonzero for an active namespace. + */ + return nsdata->ncap != 0; +} + +struct spdk_nvme_ctrlr * +spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr; +} + +uint32_t +spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns) +{ + return ns->ctrlr->max_xfer_size; +} + +uint32_t +spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->sector_size; +} + +uint32_t +spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns) +{ + return ns->extended_lba_size; +} + +uint64_t +spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns)->nsze; +} + +uint64_t +spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns) +{ + return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns); +} + +uint32_t +spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns) +{ + return ns->flags; +} + +enum spdk_nvme_pi_type +spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) { + return ns->pi_type; +} + +bool +spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? true : false; +} + +bool +spdk_nvme_ns_supports_compare(struct spdk_nvme_ns *ns) +{ + return (ns->flags & SPDK_NVME_NS_COMPARE_SUPPORTED) ? 
true : false; +} + +uint32_t +spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns) +{ + return ns->md_size; +} + +const struct spdk_nvme_ns_data * +spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns) +{ + return _nvme_ns_get_data(ns); +} + +enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value( + struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr; + const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns); + + if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) { + return SPDK_NVME_DEALLOC_READ_00; + } else { + return data->dlfeat.bits.read_value; + } +} + +uint32_t +spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns) +{ + return ns->sectors_per_stripe; +} + +static const void * +nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length) +{ + const struct spdk_nvme_ns_id_desc *desc; + size_t offset; + + offset = 0; + while (offset + 4 < sizeof(ns->id_desc_list)) { + desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset]; + + if (desc->nidl == 0) { + /* End of list */ + return NULL; + } + + /* + * Check if this descriptor fits within the list. + * 4 is the fixed-size descriptor header (not counted in NIDL). + */ + if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) { + /* Descriptor longer than remaining space in list (invalid) */ + return NULL; + } + + if (desc->nidt == type) { + *length = desc->nidl; + return &desc->nid[0]; + } + + offset += 4 + desc->nidl; + } + + return NULL; +} + +const struct spdk_uuid * +spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns) +{ + const struct spdk_uuid *uuid; + size_t uuid_size; + + uuid = nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size); + if (uuid == NULL || uuid_size != sizeof(*uuid)) { + return NULL; + } + + return uuid; +} + +int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id, + struct spdk_nvme_ctrlr *ctrlr) +{ + int rc; + + assert(id > 0); + + ns->ctrlr = ctrlr; + ns->id = id; + + rc = nvme_ctrlr_identify_ns(ns); + if (rc != 0) { + return rc; + } + + return nvme_ctrlr_identify_id_desc(ns); +} + +void nvme_ns_destruct(struct spdk_nvme_ns *ns) +{ + struct spdk_nvme_ns_data *nsdata; + + if (!ns->id) { + return; + } + + nsdata = _nvme_ns_get_data(ns); + memset(nsdata, 0, sizeof(*nsdata)); + ns->sector_size = 0; + ns->extended_lba_size = 0; + ns->md_size = 0; + ns->pi_type = 0; + ns->sectors_per_max_io = 0; + ns->sectors_per_stripe = 0; + ns->flags = 0; +} + +int nvme_ns_update(struct spdk_nvme_ns *ns) +{ + return nvme_ctrlr_identify_ns(ns); +} diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c new file mode 100644 index 000000000..eaa825fa8 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_cmd.c @@ -0,0 +1,1074 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +static inline struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, + void *cb_arg, uint32_t opc, uint32_t io_flags, + uint16_t apptag_mask, uint16_t apptag, bool check_sgl); + + +static bool +nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io, + uint32_t sectors_per_stripe, uint32_t qdepth) +{ + uint32_t child_per_io = UINT32_MAX; + + /* After a namespace is destroyed(e.g. hotplug), all the fields associated with the + * namespace will be cleared to zero, the function will return TRUE for this case, + * and -EINVAL will be returned to caller. 
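 *
 * A concrete example of the check below: with sectors_per_max_io = 256, no
 * stripe size and io_queue_requests (qdepth) = 128, a 65536-block request would
 * split into 65536 / 256 = 256 children; 256 >= 128, so the request is rejected
 * with -EINVAL instead of risking exhaustion of the qpair's request pool.
 *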
+ */ + if (sectors_per_stripe > 0) { + child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe; + } else if (sectors_per_max_io > 0) { + child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %d\n", child_per_io); + + return child_per_io >= qdepth; +} + +static struct nvme_request * +_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, + struct nvme_request *parent, bool check_sgl) +{ + struct nvme_request *child; + + child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn, + cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl); + if (child == NULL) { + nvme_request_free_children(parent); + nvme_free_request(parent); + return NULL; + } + + nvme_request_add_child(parent, child); + return child; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint32_t sectors_per_max_io, uint32_t sector_mask, + uint16_t apptag_mask, uint16_t apptag) +{ + uint32_t sector_size; + uint32_t md_size = ns->md_size; + uint32_t remaining_lba_count = lba_count; + struct nvme_request *child; + + sector_size = ns->extended_lba_size; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (md_size == 8)) { + sector_size -= 8; + } + + while (remaining_lba_count > 0) { + lba_count = sectors_per_max_io - (lba & sector_mask); + lba_count = spdk_min(remaining_lba_count, lba_count); + + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, + io_flags, apptag_mask, apptag, req, true); + if (child == NULL) { + return NULL; + } + + remaining_lba_count -= lba_count; + lba += lba_count; + payload_offset += lba_count * sector_size; + md_offset += lba_count * md_size; + } + + return req; +} + +static inline bool +_is_io_flags_valid(uint32_t io_flags) +{ + if (io_flags & ~SPDK_NVME_IO_FLAGS_VALID_MASK) { + /* Invalid io_flags */ + SPDK_ERRLOG("Invalid io_flags 0x%x\n", io_flags); + return false; + } + + return true; +} + +static void +_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req, + uint32_t opc, uint64_t lba, uint32_t lba_count, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct spdk_nvme_cmd *cmd; + + assert(_is_io_flags_valid(io_flags)); + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + *(uint64_t *)&cmd->cdw10 = lba; + + if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) { + switch (ns->pi_type) { + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1: + case SPDK_NVME_FMT_NVM_PROTECTION_TYPE2: + cmd->cdw14 = (uint32_t)lba; + break; + } + } + + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + + cmd->cdw12 = lba_count - 1; + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + cmd->cdw15 = apptag_mask; + cmd->cdw15 = (cmd->cdw15 << 16 | apptag); +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_prp(struct 
spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + bool start_valid, end_valid, last_sge, child_equals_parent; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint32_t page_size = qpair->ctrlr->page_size; + uintptr_t address; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + while (req_current_length < req->payload_size) { + + if (sge_length == 0) { + continue; + } else if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + /* + * The start of the SGE is invalid if the start address is not page aligned, + * unless it is the first SGE in the child request. + */ + start_valid = child_length == 0 || _is_page_aligned(address, page_size); + + /* Boolean for whether this is the last SGE in the parent request. */ + last_sge = (req_current_length + sge_length == req->payload_size); + + /* + * The end of the SGE is invalid if the end address is not page aligned, + * unless it is the last SGE in the parent request. + */ + end_valid = last_sge || _is_page_aligned(address + sge_length, page_size); + + /* + * This child request equals the parent request, meaning that no splitting + * was required for the parent request (the one passed into this function). + * In this case, we do not create a child request at all - we just send + * the original request as a single request at the end of this function. + */ + child_equals_parent = (child_length + sge_length == req->payload_size); + + if (start_valid) { + /* + * The start of the SGE is valid, so advance the length parameters, + * to include this SGE with previous SGEs for this child request + * (if any). If it is not valid, we do not advance the length + * parameters nor get the next SGE, because we must send what has + * been collected before this SGE as a child request. + */ + child_length += sge_length; + req_current_length += sge_length; + if (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + } + /* + * If the next SGE is not page aligned, we will need to create a child + * request for what we have so far, and then start a new child request for + * the next SGE. + */ + start_valid = _is_page_aligned(address, page_size); + } + + if (start_valid && end_valid && !last_sge) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if child_equals_parent allows us to *not* create a child request + * when no splitting is required - in that case we will fall-through and just create + * a single request with no children for the entire I/O. 
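 *
 * A concrete PRP-splitting scenario for the logic above, assuming a 4096-byte
 * page size and 512-byte blocks: a two-element SGL of 8192 bytes at 0x10000
 * followed by 4096 bytes at 0x21200 has a first SGE that both starts and ends
 * page aligned, while the second SGE starts unaligned. The accumulated
 * 8192 bytes (16 blocks) are therefore sent as one child request and the
 * remaining 4096 bytes (8 blocks) as a second child.
 *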
+ */ + if (!child_equals_parent) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. + */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static struct nvme_request * +_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, + uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, struct nvme_request *req, + uint16_t apptag_mask, uint16_t apptag) +{ + spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn; + spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn; + void *sgl_cb_arg = req->payload.contig_or_cb_arg; + uint64_t child_lba = lba; + uint32_t req_current_length = 0; + uint32_t child_length = 0; + uint32_t sge_length; + uint16_t max_sges, num_sges; + uintptr_t address; + + max_sges = ns->ctrlr->max_sges; + + reset_sgl_fn(sgl_cb_arg, payload_offset); + num_sges = 0; + + while (req_current_length < req->payload_size) { + next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length); + + if (req_current_length + sge_length > req->payload_size) { + sge_length = req->payload_size - req_current_length; + } + + child_length += sge_length; + req_current_length += sge_length; + num_sges++; + + if (num_sges < max_sges && req_current_length < req->payload_size) { + continue; + } + + /* + * We need to create a split here. Send what we have accumulated so far as a child + * request. Checking if the child equals the full payload allows us to *not* + * create a child request when no splitting is required - in that case we will + * fall-through and just create a single request with no children for the entire I/O. + */ + if (child_length != req->payload_size) { + struct nvme_request *child; + uint32_t child_lba_count; + + if ((child_length % ns->extended_lba_size) != 0) { + SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n", + child_length, ns->extended_lba_size); + return NULL; + } + child_lba_count = child_length / ns->extended_lba_size; + /* + * Note the last parameter is set to "false" - this tells the recursive + * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting + * since we have already verified it here. 
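 *
 * In this SGL variant the split points are driven by the controller's max_sges
 * rather than page alignment. For example, with max_sges = 16 and an I/O
 * described by 40 equally sized SGEs, the loop emits children covering
 * SGEs 1-16, 17-32 and 33-40; only when the whole payload fits within max_sges
 * in a single pass is the parent set up directly with no children.
 *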
+ */ + child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset, + child_lba, child_lba_count, + cb_fn, cb_arg, opc, io_flags, + apptag_mask, apptag, req, false); + if (child == NULL) { + return NULL; + } + payload_offset += child_length; + md_offset += child_lba_count * ns->md_size; + child_lba += child_lba_count; + child_length = 0; + num_sges = 0; + } + } + + if (child_length == req->payload_size) { + /* No splitting was required, so setup the whole payload as one request. */ + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + } + + return req; +} + +static inline struct nvme_request * +_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset, + uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl) +{ + struct nvme_request *req; + uint32_t sector_size; + uint32_t sectors_per_max_io; + uint32_t sectors_per_stripe; + + sector_size = ns->extended_lba_size; + sectors_per_max_io = ns->sectors_per_max_io; + sectors_per_stripe = ns->sectors_per_stripe; + + if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) && + (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) && + (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) && + (ns->md_size == 8)) { + sector_size -= 8; + } + + req = nvme_allocate_request(qpair, payload, lba_count * sector_size, lba_count * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return NULL; + } + + req->payload_offset = payload_offset; + req->md_offset = md_offset; + + /* + * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping. + * If this controller defines a stripe boundary and this I/O spans a stripe + * boundary, split the request into multiple requests and submit each + * separately to hardware. 
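 *
 * A worked example of the stripe check below: with sectors_per_stripe = 256
 * (a power of two, so sectors_per_stripe - 1 serves as the sector mask), an I/O
 * starting at lba = 200 for 100 blocks gives (200 & 255) + 100 = 300 > 256, so
 * it crosses a stripe boundary and is split into two children covering blocks
 * 200-255 (56 blocks) and 256-299 (44 blocks).
 *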
+ */ + if (sectors_per_stripe > 0 && + (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) { + + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag); + } else if (lba_count > sectors_per_max_io) { + return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, + cb_fn, + cb_arg, opc, + io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) { + if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) { + return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } else { + return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset, + lba, lba_count, cb_fn, cb_arg, opc, io_flags, + req, apptag_mask, apptag); + } + } + + _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag); + return req; +} + +int +spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } 
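+ /*
+  * Request allocation failed. If the I/O would have to be split into more
+  * child requests than the queue allows (io_queue_requests), fail with
+  * -EINVAL; otherwise the failure is a transient -ENOMEM. Every submission
+  * helper below repeats this pattern.
+  */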
else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_comparev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, + SPDK_NVME_OPC_COMPARE, io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, + 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer, + void *metadata, + uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, + apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, 0, 0, true); + if (req != NULL) { + 
return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, uint64_t lba, + uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + 
io_flags, 0, 0, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags, + spdk_nvme_req_reset_sgl_cb reset_sgl_fn, + spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata, + uint16_t apptag_mask, uint16_t apptag) +{ + struct nvme_request *req; + struct nvme_payload payload; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (reset_sgl_fn == NULL || next_sge_fn == NULL) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata); + + req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE, + io_flags, apptag_mask, apptag, true); + if (req != NULL) { + return nvme_qpair_submit_request(qpair, req); + } else if (nvme_ns_check_request_length(lba_count, + ns->sectors_per_max_io, + ns->sectors_per_stripe, + qpair->ctrlr->opts.io_queue_requests)) { + return -EINVAL; + } else { + return -ENOMEM; + } +} + +int +spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (!_is_io_flags_valid(io_flags)) { + return -EINVAL; + } + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + cmd->fuse = (io_flags & SPDK_NVME_IO_FLAGS_FUSE_MASK); + cmd->cdw12 |= (io_flags & SPDK_NVME_IO_FLAGS_CDW12_MASK); + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_write_uncorrectable(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint64_t lba, uint32_t lba_count, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + uint64_t *tmp_lba; + + if (lba_count == 0 || lba_count > UINT16_MAX + 1) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_WRITE_UNCORRECTABLE; + cmd->nsid = ns->id; + + tmp_lba = (uint64_t *)&cmd->cdw10; + *tmp_lba = lba; + cmd->cdw12 = lba_count - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + uint32_t type, + const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) { + return -EINVAL; + } + + if (ranges == NULL) { + return -EINVAL; + } + + req = nvme_allocate_request_user_copy(qpair, (void *)ranges, + num_ranges * sizeof(struct spdk_nvme_dsm_range), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT; + cmd->nsid = 
ns->id; + + cmd->cdw10_bits.dsm.nr = num_ranges - 1; + cmd->cdw11 = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_FLUSH; + cmd->nsid = ns->id; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_register_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_register_action action, + enum spdk_nvme_reservation_register_cptpl cptpl, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_register_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_register.rrega = action; + cmd->cdw10_bits.resv_register.iekey = ignore_key; + cmd->cdw10_bits.resv_register.cptpl = cptpl; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_key_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_release_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn, + cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_release.rrela = action; + cmd->cdw10_bits.resv_release.iekey = ignore_key; + cmd->cdw10_bits.resv_release.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + struct spdk_nvme_reservation_acquire_data *payload, + bool ignore_key, + enum spdk_nvme_reservation_acquire_action action, + enum spdk_nvme_reservation_type type, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_user_copy(qpair, + payload, sizeof(struct spdk_nvme_reservation_acquire_data), + cb_fn, cb_arg, true); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE; + cmd->nsid = ns->id; + + cmd->cdw10_bits.resv_acquire.racqa = action; + cmd->cdw10_bits.resv_acquire.iekey = ignore_key; + cmd->cdw10_bits.resv_acquire.rtype = type; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *payload, uint32_t len, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + uint32_t num_dwords; + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (len % 4) { + return -EINVAL; + } + num_dwords = len / 4; + + req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = 
SPDK_NVME_OPC_RESERVATION_REPORT; + cmd->nsid = ns->id; + + cmd->cdw10 = num_dwords; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c new file mode 100644 index 000000000..f60aa6789 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c @@ -0,0 +1,233 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/nvme_ocssd.h" +#include "nvme_internal.h" + +int +spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *lba_list, uint32_t num_lbas, + struct spdk_ocssd_chunk_information_entry *chunk_info, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + if (!lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET; + cmd->nsid = ns->id; + + if (chunk_info != NULL) { + cmd->mptr = spdk_vtophys(chunk_info, NULL); + } + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. 
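+ * For example, resetting two chunks stores spdk_vtophys(lba_list) in
+ * CDW10/11 and sets CDW12 to 1 (num_lbas - 1); resetting a single chunk
+ * embeds that LBA directly in CDW10/11.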
+ */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + + return nvme_qpair_submit_request(qpair, req); +} + +static int +_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + enum spdk_ocssd_io_opcode opc, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + struct nvme_payload payload; + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!buffer || !lba_list || (num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + payload = NVME_PAYLOAD_CONTIG(buffer, metadata); + + req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, num_lbas * ns->md_size, + cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = opc; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of logical block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, void *metadata, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + void *buffer, + uint64_t *lba_list, uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list, + num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags); +} + +int +spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns, + struct spdk_nvme_qpair *qpair, + uint64_t *dst_lba_list, + uint64_t *src_lba_list, + uint32_t num_lbas, + spdk_nvme_cmd_cb cb_fn, void *cb_arg, + uint32_t io_flags) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY; + + if (io_flags & ~valid_flags) { + return -EINVAL; + } + + if (!dst_lba_list || !src_lba_list || 
(num_lbas == 0) || + (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) { + return -EINVAL; + } + + req = nvme_allocate_request_null(qpair, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY; + cmd->nsid = ns->id; + + /* + * Dword 10 and 11 store a pointer to the list of source logical + * block addresses. + * Dword 14 and 15 store a pointer to the list of destination logical + * block addresses. + * If there is a single entry in the LBA list, the logical block + * address should be stored instead. + */ + if (num_lbas == 1) { + *(uint64_t *)&cmd->cdw10 = *src_lba_list; + *(uint64_t *)&cmd->cdw14 = *dst_lba_list; + } else { + *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list, NULL); + *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list, NULL); + } + + cmd->cdw12 = num_lbas - 1; + cmd->cdw12 |= io_flags; + + return nvme_qpair_submit_request(qpair, req); +} diff --git a/src/spdk/lib/nvme/nvme_opal.c b/src/spdk/lib/nvme/nvme_opal.c new file mode 100644 index 000000000..e0a3aa7fa --- /dev/null +++ b/src/spdk/lib/nvme/nvme_opal.c @@ -0,0 +1,2566 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "spdk/opal.h" +#include "spdk_internal/log.h" +#include "spdk/util.h" + +#include "nvme_opal_internal.h" + +static void +opal_nvme_security_recv_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + void *response = sess->resp; + struct spdk_opal_compacket *header = response; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + if (!header->outstanding_data && !header->min_transfer) { + sess->sess_cb(sess, 0, sess->cb_arg); + return; + } + + memset(response, 0, IO_BUFFER_LENGTH); + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static void +opal_nvme_security_send_done(void *arg, const struct spdk_nvme_cpl *cpl) +{ + struct opal_session *sess = arg; + struct spdk_opal_dev *dev = sess->dev; + int ret; + + if (spdk_nvme_cpl_is_error(cpl)) { + sess->sess_cb(sess, -EIO, sess->cb_arg); + return; + } + + ret = spdk_nvme_ctrlr_cmd_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, + dev->comid, 0, sess->resp, IO_BUFFER_LENGTH, + opal_nvme_security_recv_done, sess); + if (ret) { + sess->sess_cb(sess, ret, sess->cb_arg); + } +} + +static int +opal_nvme_security_send(struct spdk_opal_dev *dev, struct opal_session *sess, + opal_sess_cb sess_cb, void *cb_arg) +{ + sess->sess_cb = sess_cb; + sess->cb_arg = cb_arg; + + return spdk_nvme_ctrlr_cmd_security_send(dev->ctrlr, SPDK_SCSI_SECP_TCG, dev->comid, + 0, sess->cmd, IO_BUFFER_LENGTH, + opal_nvme_security_send_done, sess); +} + +static void +opal_send_recv_done(struct opal_session *sess, int status, void *ctx) +{ + sess->status = status; + sess->done = true; +} + +static int +opal_send_recv(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int ret; + + sess->done = false; + ret = opal_nvme_security_send(dev, sess, opal_send_recv_done, NULL); + if (ret) { + return ret; + } + + while (!sess->done) { + spdk_nvme_ctrlr_process_admin_completions(dev->ctrlr); + } + + return sess->status; +} + +static struct opal_session * +opal_alloc_session(struct spdk_opal_dev *dev) +{ + struct opal_session *sess; + + sess = calloc(1, sizeof(*sess)); + if (!sess) { + return NULL; + } + sess->dev = dev; + + return sess; +} + +static void +opal_add_token_u8(int *err, struct opal_session *sess, uint8_t token) +{ + if (*err) { + return; + } + if (sess->cmd_pos >= IO_BUFFER_LENGTH - 1) { + SPDK_ERRLOG("Error adding u8: end of buffer.\n"); + *err = -ERANGE; + return; + } + sess->cmd[sess->cmd_pos++] = token; +} + +static void +opal_add_short_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t atom; + int err = 0; + + atom = SPDK_SHORT_ATOM_ID; + atom |= bytestring ? SPDK_SHORT_ATOM_BYTESTRING_FLAG : 0; + atom |= has_sign ? SPDK_SHORT_ATOM_SIGN_FLAG : 0; + atom |= len & SPDK_SHORT_ATOM_LEN_MASK; + + opal_add_token_u8(&err, sess, atom); +} + +static void +opal_add_medium_atom_header(struct opal_session *sess, bool bytestring, + bool has_sign, size_t len) +{ + uint8_t header; + + header = SPDK_MEDIUM_ATOM_ID; + header |= bytestring ? SPDK_MEDIUM_ATOM_BYTESTRING_FLAG : 0; + header |= has_sign ? 
SPDK_MEDIUM_ATOM_SIGN_FLAG : 0; + header |= (len >> 8) & SPDK_MEDIUM_ATOM_LEN_MASK; + sess->cmd[sess->cmd_pos++] = header; + sess->cmd[sess->cmd_pos++] = len; +} + +static void +opal_add_token_bytestring(int *err, struct opal_session *sess, + const uint8_t *bytestring, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (*err) { + return; + } + + if (len & ~SPDK_SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (len >= IO_BUFFER_LENGTH - sess->cmd_pos - header_len) { + SPDK_ERRLOG("Error adding bytestring: end of buffer.\n"); + *err = -ERANGE; + return; + } + + if (is_short_atom) { + opal_add_short_atom_header(sess, true, false, len); + } else { + opal_add_medium_atom_header(sess, true, false, len); + } + + memcpy(&sess->cmd[sess->cmd_pos], bytestring, len); + sess->cmd_pos += len; +} + +static void +opal_add_token_u64(int *err, struct opal_session *sess, uint64_t number) +{ + int startat = 0; + + if (*err) { + return; + } + + /* add header first */ + if (number <= SPDK_TINY_ATOM_DATA_MASK) { + sess->cmd[sess->cmd_pos++] = (uint8_t) number & SPDK_TINY_ATOM_DATA_MASK; + } else { + if (number < 0x100) { + sess->cmd[sess->cmd_pos++] = 0x81; /* short atom, 1 byte length */ + startat = 0; + } else if (number < 0x10000) { + sess->cmd[sess->cmd_pos++] = 0x82; /* short atom, 2 byte length */ + startat = 1; + } else if (number < 0x100000000) { + sess->cmd[sess->cmd_pos++] = 0x84; /* short atom, 4 byte length */ + startat = 3; + } else { + sess->cmd[sess->cmd_pos++] = 0x88; /* short atom, 8 byte length */ + startat = 7; + } + + /* add number value */ + for (int i = startat; i > -1; i--) { + sess->cmd[sess->cmd_pos++] = (uint8_t)((number >> (i * 8)) & 0xff); + } + } +} + +static void +opal_add_tokens(int *err, struct opal_session *sess, int num, ...) 
+{ + int i; + va_list args_ptr; + enum spdk_opal_token tmp; + + va_start(args_ptr, num); + + for (i = 0; i < num; i++) { + tmp = va_arg(args_ptr, enum spdk_opal_token); + opal_add_token_u8(err, sess, tmp); + if (*err != 0) { break; } + } + + va_end(args_ptr); +} + +static int +opal_cmd_finalize(struct opal_session *sess, uint32_t hsn, uint32_t tsn, bool eod) +{ + struct spdk_opal_header *hdr; + int err = 0; + + if (eod) { + opal_add_tokens(&err, sess, 6, SPDK_OPAL_ENDOFDATA, + SPDK_OPAL_STARTLIST, + 0, 0, 0, + SPDK_OPAL_ENDLIST); + } + + if (err) { + SPDK_ERRLOG("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct spdk_opal_header *)sess->cmd; + + to_be32(&hdr->packet.session_tsn, tsn); + to_be32(&hdr->packet.session_hsn, hsn); + + to_be32(&hdr->sub_packet.length, sess->cmd_pos - sizeof(*hdr)); + while (sess->cmd_pos % 4) { + if (sess->cmd_pos >= IO_BUFFER_LENGTH) { + SPDK_ERRLOG("Error: Buffer overrun\n"); + return -ERANGE; + } + sess->cmd[sess->cmd_pos++] = 0; + } + to_be32(&hdr->packet.length, sess->cmd_pos - sizeof(hdr->com_packet) - + sizeof(hdr->packet)); + to_be32(&hdr->com_packet.length, sess->cmd_pos - sizeof(hdr->com_packet)); + + return 0; +} + +static size_t +opal_response_parse_tiny(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = 1; + token->width = OPAL_WIDTH_TINY; + + if (pos[0] & SPDK_TINY_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + token->stored.unsigned_num = pos[0] & SPDK_TINY_ATOM_DATA_MASK; + } + + return token->len; +} + +static int +opal_response_parse_short(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (pos[0] & SPDK_SHORT_ATOM_LEN_MASK) + 1; /* plus 1-byte header */ + token->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SPDK_SHORT_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_SHORT_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + uint64_t u_integer = 0; + size_t i, b = 0; + + token->type = OPAL_DTA_TOKENID_UINT; + if (token->len > 9) { + SPDK_ERRLOG("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = token->len - 1; i > 0; i--) { + u_integer |= ((uint64_t)pos[i] << (8 * b)); + b++; + } + token->stored.unsigned_num = u_integer; + } + + return token->len; +} + +static size_t +opal_response_parse_medium(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = (((pos[0] & SPDK_MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; /* plus 2-byte header */ + token->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & SPDK_MEDIUM_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_MEDIUM_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_long(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + token->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; /* plus 4-byte header */ + token->width = OPAL_WIDTH_LONG; + + if (pos[0] & SPDK_LONG_ATOM_BYTESTRING_FLAG) { + token->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SPDK_LONG_ATOM_SIGN_FLAG) { + token->type = OPAL_DTA_TOKENID_SINT; + } else { + token->type = OPAL_DTA_TOKENID_UINT; + } + + return token->len; +} + +static size_t +opal_response_parse_token(struct spdk_opal_resp_token *token, + const uint8_t *pos) +{ + token->pos = pos; + 
token->len = 1; + token->type = OPAL_DTA_TOKENID_TOKEN; + token->width = OPAL_WIDTH_TOKEN; + + return token->len; +} + +static int +opal_response_parse(const uint8_t *buf, size_t length, + struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_header *hdr; + struct spdk_opal_resp_token *token_iter; + int num_entries = 0; + int total; + size_t token_length; + const uint8_t *pos; + uint32_t clen, plen, slen; + + if (!buf || !resp) { + return -EINVAL; + } + + hdr = (struct spdk_opal_header *)buf; + pos = buf + sizeof(*hdr); + + clen = from_be32(&hdr->com_packet.length); + plen = from_be32(&hdr->packet.length); + slen = from_be32(&hdr->sub_packet.length); + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "Response size: cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + SPDK_ERRLOG("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + return -EINVAL; + } + + if (pos > buf + length) { + SPDK_ERRLOG("Pointer out of range\n"); + return -EFAULT; + } + + token_iter = resp->resp_tokens; + total = slen; + + while (total > 0) { + if (pos[0] <= SPDK_TINY_ATOM_TYPE_MAX) { /* tiny atom */ + token_length = opal_response_parse_tiny(token_iter, pos); + } else if (pos[0] <= SPDK_SHORT_ATOM_TYPE_MAX) { /* short atom */ + token_length = opal_response_parse_short(token_iter, pos); + } else if (pos[0] <= SPDK_MEDIUM_ATOM_TYPE_MAX) { /* medium atom */ + token_length = opal_response_parse_medium(token_iter, pos); + } else if (pos[0] <= SPDK_LONG_ATOM_TYPE_MAX) { /* long atom */ + token_length = opal_response_parse_long(token_iter, pos); + } else { /* TOKEN */ + token_length = opal_response_parse_token(token_iter, pos); + } + + if (token_length <= 0) { + SPDK_ERRLOG("Parse response failure.\n"); + return -EINVAL; + } + + pos += token_length; + total -= token_length; + token_iter++; + num_entries++; + + if (total < 0) { + SPDK_ERRLOG("Length not matching.\n"); + return -EINVAL; + } + } + + if (num_entries == 0) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return -EINVAL; + } + resp->num = num_entries; + + return 0; +} + +static inline bool +opal_response_token_matches(const struct spdk_opal_resp_token *token, + uint8_t match) +{ + if (!token || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) { + return false; + } + return true; +} + +static const struct spdk_opal_resp_token * +opal_response_get_token(const struct spdk_opal_resp_parsed *resp, int index) +{ + const struct spdk_opal_resp_token *token; + + if (index >= resp->num) { + SPDK_ERRLOG("Token number doesn't exist: %d, resp: %d\n", + index, resp->num); + return NULL; + } + + token = &resp->resp_tokens[index]; + if (token->len == 0) { + SPDK_ERRLOG("Token length must be non-zero\n"); + return NULL; + } + + return token; +} + +static uint64_t +opal_response_get_u64(const struct spdk_opal_resp_parsed *resp, int index) +{ + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (resp->resp_tokens[index].type != OPAL_DTA_TOKENID_UINT) { + SPDK_ERRLOG("Token is not unsigned int: %d\n", + resp->resp_tokens[index].type); + return 0; + } + + if (!(resp->resp_tokens[index].width == OPAL_WIDTH_TINY || + resp->resp_tokens[index].width == OPAL_WIDTH_SHORT)) { + SPDK_ERRLOG("Atom is not short or tiny: %d\n", + resp->resp_tokens[index].width); + return 0; + } + + return resp->resp_tokens[index].stored.unsigned_num; +} + +static uint16_t +opal_response_get_u16(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = 
opal_response_get_u64(resp, index); + if (i > 0xffffull) { + SPDK_ERRLOG("parse reponse u16 failed. Overflow\n"); + return 0; + } + return (uint16_t) i; +} + +static uint8_t +opal_response_get_u8(const struct spdk_opal_resp_parsed *resp, int index) +{ + uint64_t i = opal_response_get_u64(resp, index); + if (i > 0xffull) { + SPDK_ERRLOG("parse reponse u8 failed. Overflow\n"); + return 0; + } + return (uint8_t) i; +} + +static size_t +opal_response_get_string(const struct spdk_opal_resp_parsed *resp, int n, + const char **store) +{ + uint8_t header_len; + struct spdk_opal_resp_token token; + *store = NULL; + if (!resp) { + SPDK_ERRLOG("Response is NULL\n"); + return 0; + } + + if (n > resp->num) { + SPDK_ERRLOG("Response has %d tokens. Can't access %d\n", + resp->num, n); + return 0; + } + + token = resp->resp_tokens[n]; + if (token.type != OPAL_DTA_TOKENID_BYTESTRING) { + SPDK_ERRLOG("Token is not a byte string!\n"); + return 0; + } + + switch (token.width) { + case OPAL_WIDTH_SHORT: + header_len = 1; + break; + case OPAL_WIDTH_MEDIUM: + header_len = 2; + break; + case OPAL_WIDTH_LONG: + header_len = 4; + break; + default: + SPDK_ERRLOG("Can't get string from this Token\n"); + return 0; + } + + *store = token.pos + header_len; + return token.len - header_len; +} + +static int +opal_response_status(const struct spdk_opal_resp_parsed *resp) +{ + const struct spdk_opal_resp_token *tok; + + /* if we get an EOS token, just return 0 */ + tok = opal_response_get_token(resp, 0); + if (opal_response_token_matches(tok, SPDK_OPAL_ENDOFSESSION)) { + return 0; + } + + if (resp->num < 5) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 5); /* the first token should be STARTLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_STARTLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + tok = opal_response_get_token(resp, resp->num - 1); /* the last token should be ENDLIST */ + if (!opal_response_token_matches(tok, SPDK_OPAL_ENDLIST)) { + return SPDK_DTAERROR_NO_METHOD_STATUS; + } + + /* The second and third values in the status list are reserved, and are + defined in core spec to be 0x00 and 0x00 and SHOULD be ignored by the host. */ + return (int)opal_response_get_u64(resp, + resp->num - 4); /* We only need the first value in the status list. */ +} + +static int +opal_parse_and_check_status(struct opal_session *sess) +{ + int error; + + error = opal_response_parse(sess->resp, IO_BUFFER_LENGTH, &sess->parsed_resp); + if (error) { + SPDK_ERRLOG("Couldn't parse response.\n"); + return error; + } + return opal_response_status(&sess->parsed_resp); +} + +static inline void +opal_clear_cmd(struct opal_session *sess) +{ + sess->cmd_pos = sizeof(struct spdk_opal_header); + memset(sess->cmd, 0, IO_BUFFER_LENGTH); +} + +static inline void +opal_set_comid(struct opal_session *sess, uint16_t comid) +{ + struct spdk_opal_header *hdr = (struct spdk_opal_header *)sess->cmd; + + hdr->com_packet.comid[0] = comid >> 8; + hdr->com_packet.comid[1] = comid; + hdr->com_packet.extended_comid[0] = 0; + hdr->com_packet.extended_comid[1] = 0; +} + +static inline int +opal_init_key(struct spdk_opal_key *opal_key, const char *passwd) +{ + int len; + + if (passwd == NULL || passwd[0] == '\0') { + SPDK_ERRLOG("Password is empty. Create key failed\n"); + return -EINVAL; + } + + len = strlen(passwd); + + if (len >= OPAL_KEY_MAX) { + SPDK_ERRLOG("Password too long. 
Create key failed\n"); + return -EINVAL; + } + + opal_key->key_len = len; + memcpy(opal_key->key, passwd, opal_key->key_len); + + return 0; +} + +static void +opal_build_locking_range(uint8_t *buffer, uint8_t locking_range) +{ + memcpy(buffer, spdk_opal_uid[UID_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + /* global */ + if (locking_range == 0) { + return; + } + + /* non-global */ + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = locking_range; +} + +static void +opal_check_tper(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_tper_feat *tper = data; + + dev->feat_info.tper = *tper; +} + +/* + * check single user mode + */ +static bool +opal_check_sum(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_single_user_mode_feat *sum = data; + uint32_t num_locking_objects = from_be32(&sum->num_locking_objects); + + if (num_locking_objects == 0) { + SPDK_NOTICELOG("Need at least one locking object.\n"); + return false; + } + + dev->feat_info.single_user = *sum; + + return true; +} + +static void +opal_check_lock(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_locking_feat *lock = data; + + dev->feat_info.locking = *lock; +} + +static void +opal_check_geometry(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_geo_feat *geo = data; + + dev->feat_info.geo = *geo; +} + +static void +opal_check_datastore(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_datastore_feat *datastore = data; + + dev->feat_info.datastore = *datastore; +} + +static uint16_t +opal_get_comid_v100(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v100_feat *v100 = data; + uint16_t base_comid = from_be16(&v100->base_comid); + + dev->feat_info.v100 = *v100; + + return base_comid; +} + +static uint16_t +opal_get_comid_v200(struct spdk_opal_dev *dev, const void *data) +{ + const struct spdk_opal_d0_v200_feat *v200 = data; + uint16_t base_comid = from_be16(&v200->base_comid); + + dev->feat_info.v200 = *v200; + + return base_comid; +} + +static int +opal_discovery0_end(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + bool supported = false, single_user = false; + const struct spdk_opal_d0_hdr *hdr = (struct spdk_opal_d0_hdr *)payload; + struct spdk_opal_d0_feat_hdr *feat_hdr; + const uint8_t *epos = payload, *cpos = payload; + uint16_t comid = 0; + uint32_t hlen = from_be32(&(hdr->length)); + + if (hlen > payload_size - sizeof(*hdr)) { + SPDK_ERRLOG("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, payload_size); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos) { + feat_hdr = (struct spdk_opal_d0_feat_hdr *)cpos; + uint16_t feat_code = from_be16(&feat_hdr->code); + + switch (feat_code) { + case FEATURECODE_TPER: + opal_check_tper(dev, cpos); + break; + case FEATURECODE_SINGLEUSER: + single_user = opal_check_sum(dev, cpos); + break; + case FEATURECODE_GEOMETRY: + opal_check_geometry(dev, cpos); + break; + case FEATURECODE_LOCKING: + opal_check_lock(dev, cpos); + break; + case FEATURECODE_DATASTORE: + opal_check_datastore(dev, cpos); + break; + case FEATURECODE_OPALV100: + comid = opal_get_comid_v100(dev, cpos); + supported = true; + break; + case FEATURECODE_OPALV200: + comid = opal_get_comid_v200(dev, cpos); + supported = true; + break; + default: + SPDK_INFOLOG(SPDK_LOG_OPAL, "Unknow feature code: %d\n", feat_code); + } + cpos += 
feat_hdr->length + sizeof(*feat_hdr); + } + + if (supported == false) { + SPDK_ERRLOG("Opal Not Supported.\n"); + return -ENOTSUP; + } + + if (single_user == false) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Single User Mode Not Supported\n"); + } + + dev->comid = comid; + return 0; +} + +static int +opal_discovery0(struct spdk_opal_dev *dev, void *payload, uint32_t payload_size) +{ + int ret; + + ret = spdk_nvme_ctrlr_security_receive(dev->ctrlr, SPDK_SCSI_SECP_TCG, LV0_DISCOVERY_COMID, + 0, payload, payload_size); + if (ret) { + return ret; + } + + return opal_discovery0_end(dev, payload, payload_size); +} + +static int +opal_end_session(struct spdk_opal_dev *dev, struct opal_session *sess, uint16_t comid) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, comid); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDOFSESSION); + + if (err < 0) { + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, false); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + sess->hsn = 0; + sess->tsn = 0; + + return opal_parse_and_check_status(sess); +} + +void +spdk_opal_dev_destruct(struct spdk_opal_dev *dev) +{ + free(dev); +} + +static int +opal_start_session_done(struct opal_session *sess) +{ + uint32_t hsn, tsn; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + hsn = opal_response_get_u64(&sess->parsed_resp, 4); + tsn = opal_response_get_u64(&sess->parsed_resp, 5); + + if (hsn == 0 && tsn == 0) { + SPDK_ERRLOG("Couldn't authenticate session\n"); + return -EPERM; + } + + sess->hsn = hsn; + sess->tsn = tsn; + + return 0; +} + +static int +opal_start_generic_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum opal_uid_enum auth, + enum opal_uid_enum sp_type, + const char *key, + uint8_t key_len) +{ + uint32_t hsn; + int err = 0; + int ret; + + if (key == NULL && auth != UID_ANYBODY) { + return OPAL_INVAL_PARAM; + } + + opal_clear_cmd(sess); + + opal_set_comid(sess, dev->comid); + hsn = GENERIC_HOST_SESSION_NUM; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[sp_type], OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_TRUE); /* Write */ + + switch (auth) { + case UID_ANYBODY: + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + case UID_ADMIN1: + case UID_SID: + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTNAME); + opal_add_token_u8(&err, sess, 0); /* HostChallenge */ + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 3, /* number of token */ + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + 3);/* HostSignAuth */ + opal_add_token_bytestring(&err, sess, spdk_opal_uid[auth], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDNAME); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + break; + default: + SPDK_ERRLOG("Cannot start Admin SP session with auth %d\n", auth); + return -EINVAL; + } + + if (err) { + SPDK_ERRLOG("Error building start adminsp session command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return 
opal_start_session_done(sess); +} + +static int +opal_get_msid_cpin_pin_done(struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + const char *msid_pin; + size_t strlen; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + strlen = opal_response_get_string(&sess->parsed_resp, 4, &msid_pin); + if (!msid_pin) { + SPDK_ERRLOG("Couldn't extract PIN from response\n"); + return -EINVAL; + } + + opal_key->key_len = strlen; + memcpy(opal_key->key, msid_pin, opal_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "MSID = %p\n", opal_key->key); + return 0; +} + +static int +opal_get_msid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *opal_key) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_C_PIN_MSID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_PIN, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Get MSID CPIN PIN command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_msid_cpin_pin_done(sess, opal_key); +} + +static int +opal_build_generic_pw_cmd(struct opal_session *sess, uint8_t *key, size_t key_len, + uint8_t *cpin_uid, struct spdk_opal_dev *dev) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, cpin_uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_PIN); + opal_add_token_bytestring(&err, sess, key, key_len); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + return err; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_get_locking_sp_lifecycle_done(struct opal_session *sess) +{ + uint8_t lifecycle; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + lifecycle = opal_response_get_u64(&sess->parsed_resp, 4); + if (lifecycle != OPAL_MANUFACTURED_INACTIVE) { /* status before activate */ + SPDK_ERRLOG("Couldn't determine the status of the Lifecycle state\n"); + return -EINVAL; + } + + return 0; +} + +static int +opal_get_locking_sp_lifecycle(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + 
SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_LIFECYCLE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_sp_lifecycle_done(sess); +} + +static int +opal_activate(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ACTIVATE_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building Activate LockingSP command.\n"); + return err; + } + + /* TODO: Single User Mode for activatation */ + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_start_auth_session(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_user[OPAL_UID_LENGTH]; + int err = 0; + int ret; + uint32_t hsn = GENERIC_HOST_SESSION_NUM; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (user != OPAL_ADMIN1) { + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + } else { + memcpy(uid_user, spdk_opal_uid[UID_ADMIN1], OPAL_UID_LENGTH); + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_SMUID], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[STARTSESSION_METHOD], + OPAL_UID_LENGTH); + + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u64(&err, sess, hsn); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKINGSP], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_TRUE, SPDK_OPAL_STARTNAME, + 0); /* True for a Read-Write session */ + opal_add_token_bytestring(&err, sess, opal_key->key, opal_key->key_len); + opal_add_tokens(&err, sess, 3, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME, 3); /* HostSignAuth */ + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building STARTSESSION command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_start_session_done(sess); +} + +static int +opal_lock_unlock_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + uint8_t read_locked, write_locked; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + switch (l_state) { + case OPAL_READONLY: + read_locked = 0; + write_locked = 1; + break; + case OPAL_READWRITE: + read_locked = 0; + 
write_locked = 0; + break; + case OPAL_RWLOCK: + read_locked = 1; + write_locked = 1; + break; + default: + SPDK_ERRLOG("Tried to set an invalid locking state.\n"); + return -EINVAL; + } + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 15, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + read_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + write_locked, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building SET command.\n"); + return err; + } + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int opal_generic_locking_range_enable_disable(struct spdk_opal_dev *dev, + struct opal_session *sess, + uint8_t *uid, bool read_lock_enabled, bool write_lock_enabled) +{ + int err = 0; + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 23, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED, + read_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED, + write_lock_enabled, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKED, + 0, + SPDK_OPAL_ENDNAME, + + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building locking range enable/disable command.\n"); + } + return err; +} + +static int +opal_setup_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + uint64_t range_start, uint64_t range_length, + bool read_lock_enabled, bool write_lock_enabled) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + if (locking_range == 0) { + err = opal_generic_locking_range_enable_disable(dev, sess, uid_locking_range, + read_lock_enabled, write_lock_enabled); + } else { + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 6, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGESTART); + opal_add_token_u64(&err, sess, range_start); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_RANGELENGTH); + opal_add_token_u64(&err, sess, range_length); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_READLOCKENABLED); + opal_add_token_u64(&err, sess, read_lock_enabled); + opal_add_tokens(&err, sess, 3, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_WRITELOCKENABLED); + 
opal_add_token_u64(&err, sess, write_lock_enabled); + opal_add_tokens(&err, sess, 4, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + } + if (err) { + SPDK_ERRLOG("Error building Setup Locking range command.\n"); + return err; + + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_max_ranges_done(struct opal_session *sess) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + /* "MaxRanges" is token 4 of response */ + return opal_response_get_u16(&sess->parsed_resp, 4); +} + +static int +opal_get_max_ranges(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_LOCKING_INFO_TABLE], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_MAXRANGES, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building GET Lifecycle Status command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_max_ranges_done(sess); +} + +static int +opal_get_locking_range_info_done(struct opal_session *sess, + struct spdk_opal_locking_range_info *info) +{ + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + info->range_start = opal_response_get_u64(&sess->parsed_resp, 4); + info->range_length = opal_response_get_u64(&sess->parsed_resp, 8); + info->read_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 12); + info->write_lock_enabled = opal_response_get_u8(&sess->parsed_resp, 16); + info->read_locked = opal_response_get_u8(&sess->parsed_resp, 20); + info->write_locked = opal_response_get_u8(&sess->parsed_resp, 24); + + return 0; +} + +static int +opal_get_locking_range_info(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_locking_range locking_range_id) +{ + int err = 0; + int ret; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + struct spdk_opal_locking_range_info *info; + + opal_build_locking_range(uid_locking_range, locking_range_id); + + assert(locking_range_id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[locking_range_id]; + memset(info, 0, sizeof(*info)); + info->locking_range_id = locking_range_id; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], OPAL_UID_LENGTH); + + + opal_add_tokens(&err, sess, 12, SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_RANGESTART, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_WRITELOCKED, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + 
SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building get locking range info command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_locking_range_info_done(sess, info); +} + +static int +opal_enable_user(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 11, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_AUTH_ENABLE, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error Building enable user command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_add_user_to_locking_range(struct spdk_opal_dev *dev, + struct opal_session *sess, + enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range, + enum spdk_opal_lock_state l_state) +{ + int err = 0; + int ret; + uint8_t uid_user[OPAL_UID_LENGTH]; + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + + memcpy(uid_user, spdk_opal_uid[UID_USER1], OPAL_UID_LENGTH); + uid_user[7] = user; + + switch (l_state) { + case OPAL_READONLY: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_RDLOCKED], OPAL_UID_LENGTH); + break; + case OPAL_READWRITE: + memcpy(uid_locking_range, spdk_opal_uid[UID_LOCKINGRANGE_ACE_WRLOCKED], OPAL_UID_LENGTH); + break; + default: + SPDK_ERRLOG("locking state should only be OPAL_READONLY or OPAL_READWRITE\n"); + return -EINVAL; + } + + uid_locking_range[7] = locking_range; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[SET_METHOD], OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 8, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_VALUES, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_BOOLEAN_EXPR, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH / 2); + opal_add_token_bytestring(&err, sess, uid_user, OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_ENDNAME, SPDK_OPAL_STARTNAME); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_HALF_BOOLEAN_ACE], OPAL_UID_LENGTH / 2); + opal_add_tokens(&err, sess, 7, + SPDK_OPAL_TRUE, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDNAME, + 
SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building add user to locking range command\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_new_user_passwd(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_user user, + struct spdk_opal_key *opal_key) +{ + uint8_t uid_cpin[OPAL_UID_LENGTH]; + int ret; + + if (user == OPAL_ADMIN1) { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_ADMIN1], OPAL_UID_LENGTH); + } else { + memcpy(uid_cpin, spdk_opal_uid[UID_C_PIN_USER1], OPAL_UID_LENGTH); + uid_cpin[7] = user; + } + + ret = opal_build_generic_pw_cmd(sess, opal_key->key, opal_key->key_len, uid_cpin, dev); + if (ret != 0) { + SPDK_ERRLOG("Error building set password command\n"); + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_set_sid_cpin_pin(struct spdk_opal_dev *dev, struct opal_session *sess, char *new_passwd) +{ + uint8_t cpin_uid[OPAL_UID_LENGTH]; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, new_passwd); + if (ret != 0) { + return ret; + } + + memcpy(cpin_uid, spdk_opal_uid[UID_C_PIN_SID], OPAL_UID_LENGTH); + + if (opal_build_generic_pw_cmd(sess, opal_key.key, opal_key.key_len, cpin_uid, dev)) { + SPDK_ERRLOG("Error building Set SID cpin\n"); + return -ERANGE; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_take_ownership(struct spdk_opal_dev *dev, char *new_passwd) +{ + int ret; + struct spdk_opal_key opal_key = {}; + struct opal_session *sess; + + assert(dev != NULL); + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ANYBODY, UID_ADMINSP, NULL, 0); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + + ret = opal_get_msid_cpin_pin(dev, sess, &opal_key); + if (ret) { + SPDK_ERRLOG("get msid error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + goto end; + } + + /* reuse the session structure */ + memset(sess, 0, sizeof(*sess)); + sess->dev = dev; + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start admin SP session error %d\n", ret); + goto end; + } + memset(&opal_key, 0, sizeof(struct spdk_opal_key)); + + ret = opal_set_sid_cpin_pin(dev, sess, new_passwd); + if (ret) { + SPDK_ERRLOG("set cpin error %d\n", ret); + opal_end_session(dev, sess, dev->comid); + goto end; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + +end: + free(sess); + return ret; +} + +struct spdk_opal_dev * + spdk_opal_dev_construct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct spdk_opal_dev *dev; + void *payload; + + dev = calloc(1, sizeof(*dev)); + if (!dev) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + dev->ctrlr = ctrlr; + + payload = calloc(1, IO_BUFFER_LENGTH); + if (!payload) { + free(dev); + return NULL; + } + + if (opal_discovery0(dev, payload, IO_BUFFER_LENGTH)) { + SPDK_INFOLOG(SPDK_LOG_OPAL, "Opal is not supported on this device\n"); + 
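/* Level 0 Discovery either failed or did not report Opal support, so
+ * release the partially constructed device and its payload buffer. */ +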
free(dev); + free(payload); + return NULL; + } + + free(payload); + return dev; +} + +static int +opal_build_revert_tper_cmd(struct spdk_opal_dev *dev, struct opal_session *sess) +{ + int err = 0; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, spdk_opal_uid[UID_ADMINSP], + OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[REVERT_METHOD], + OPAL_UID_LENGTH); + opal_add_token_u8(&err, sess, SPDK_OPAL_STARTLIST); + opal_add_token_u8(&err, sess, SPDK_OPAL_ENDLIST); + if (err) { + SPDK_ERRLOG("Error building REVERT TPER command.\n"); + return -ERANGE; + } + + return opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); +} + +static int +opal_gen_new_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + struct spdk_opal_key *active_key) +{ + uint8_t uid_data[OPAL_UID_LENGTH] = {0}; + int err = 0; + int length; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + if (active_key->key_len == 0) { + SPDK_ERRLOG("Error finding previous data to generate new active key\n"); + return -EINVAL; + } + + length = spdk_min(active_key->key_len, OPAL_UID_LENGTH); + memcpy(uid_data, active_key->key, length); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_data, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GENKEY_METHOD], + OPAL_UID_LENGTH); + + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building new key generation command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +static int +opal_get_active_key_done(struct opal_session *sess, struct spdk_opal_key *active_key) +{ + const char *key; + size_t str_len; + int error = 0; + + error = opal_parse_and_check_status(sess); + if (error) { + return error; + } + + str_len = opal_response_get_string(&sess->parsed_resp, 4, &key); + if (!key) { + SPDK_ERRLOG("Couldn't extract active key from response\n"); + return -EINVAL; + } + + active_key->key_len = str_len; + memcpy(active_key->key, key, active_key->key_len); + + SPDK_DEBUGLOG(SPDK_LOG_OPAL, "active key = %p\n", active_key->key); + return 0; +} + +static int +opal_get_active_key(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range, + struct spdk_opal_key *active_key) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[GET_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 12, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTLIST, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_STARTCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_STARTNAME, + SPDK_OPAL_ENDCOLUMN, + SPDK_OPAL_ACTIVEKEY, + SPDK_OPAL_ENDNAME, + SPDK_OPAL_ENDLIST, + SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building get active key command.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = 
opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_get_active_key_done(sess, active_key); +} + +static int +opal_erase_locking_range(struct spdk_opal_dev *dev, struct opal_session *sess, + enum spdk_opal_locking_range locking_range) +{ + uint8_t uid_locking_range[OPAL_UID_LENGTH]; + int err = 0; + int ret; + + opal_clear_cmd(sess); + opal_set_comid(sess, dev->comid); + + opal_build_locking_range(uid_locking_range, locking_range); + + opal_add_token_u8(&err, sess, SPDK_OPAL_CALL); + opal_add_token_bytestring(&err, sess, uid_locking_range, OPAL_UID_LENGTH); + opal_add_token_bytestring(&err, sess, spdk_opal_method[ERASE_METHOD], + OPAL_UID_LENGTH); + opal_add_tokens(&err, sess, 2, SPDK_OPAL_STARTLIST, SPDK_OPAL_ENDLIST); + + if (err) { + SPDK_ERRLOG("Error building erase locking range.\n"); + return err; + } + + ret = opal_cmd_finalize(sess, sess->hsn, sess->tsn, true); + if (ret) { + return ret; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + return ret; + } + + return opal_parse_and_check_status(sess); +} + +int +spdk_opal_cmd_revert_tper(struct spdk_opal_dev *dev, const char *passwd) +{ + int ret; + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret) { + SPDK_ERRLOG("Init key failed\n"); + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_build_revert_tper_cmd(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Build revert tper command with error %d\n", ret); + goto end; + } + + ret = opal_send_recv(dev, sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + goto end; + } + + ret = opal_parse_and_check_status(sess); + if (ret) { + opal_end_session(dev, sess, dev->comid); + SPDK_ERRLOG("Error on reverting TPer with error %d\n", ret); + } + /* No opal_end_session() required here for successful case */ + +end: + free(sess); + return ret; +} + +int +spdk_opal_cmd_activate_locking_sp(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_SID, UID_ADMINSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("Error on starting admin SP session with error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_sp_lifecycle(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on getting SP lifecycle with error %d\n", ret); + goto end; + } + + ret = opal_activate(dev, sess); + if (ret) { + SPDK_ERRLOG("Error on activation with error %d\n", ret); + } + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("Error on ending session with error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_lock_unlock(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_lock_state flag, enum spdk_opal_locking_range locking_range, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + 
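/* Like the other spdk_opal_cmd_* entry points: turn the password into an
+ * spdk_opal_key, allocate a session, open a session for the requested
+ * authority, issue the method, then always end the session before freeing
+ * it. */ +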
ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_lock_unlock_range(dev, sess, locking_range, flag); + if (ret) { + SPDK_ERRLOG("lock unlock range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_setup_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user, + enum spdk_opal_locking_range locking_range_id, uint64_t range_start, + uint64_t range_length, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_setup_locking_range(dev, sess, locking_range_id, range_start, range_length, true, + true); + if (ret) { + SPDK_ERRLOG("setup locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_get_max_ranges(struct spdk_opal_dev *dev, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + if (dev->max_ranges) { + return dev->max_ranges; + } + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, OPAL_ADMIN1, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_max_ranges(dev, sess); + if (ret > 0) { + dev->max_ranges = ret; + } + + ret = opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + + return (ret == 0 ? 
dev->max_ranges : ret); +} + +int +spdk_opal_cmd_get_locking_range_info(struct spdk_opal_dev *dev, const char *passwd, + enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_get_locking_range_info(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get locking range info error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_enable_user(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_enable_user(dev, sess, user_id); + if (ret) { + SPDK_ERRLOG("enable user error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_add_user_to_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, + enum spdk_opal_lock_state lock_flag, const char *passwd) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_generic_session(dev, sess, UID_ADMIN1, UID_LOCKINGSP, + opal_key.key, opal_key.key_len); + if (ret) { + SPDK_ERRLOG("start locking SP session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_add_user_to_locking_range(dev, sess, user_id, locking_range_id, lock_flag); + if (ret) { + SPDK_ERRLOG("add user to locking range error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_set_new_passwd(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + const char *new_passwd, const char *old_passwd, bool new_user) +{ + struct opal_session *sess; + struct spdk_opal_key old_key = {}; + struct spdk_opal_key new_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&old_key, old_passwd); + if (ret != 0) { + return ret; + } + + ret = opal_init_key(&new_key, new_passwd); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, new_user ? 
OPAL_ADMIN1 : user_id, + &old_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_new_user_passwd(dev, sess, user_id, &new_key); + if (ret) { + SPDK_ERRLOG("set new passwd error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + sess = opal_alloc_session(dev); + if (!sess) { + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(sess); + return ret; + } + + ret = opal_erase_locking_range(dev, sess, locking_range_id); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + } + + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + + free(sess); + return ret; +} + +int +spdk_opal_cmd_secure_erase_locking_range(struct spdk_opal_dev *dev, enum spdk_opal_user user_id, + enum spdk_opal_locking_range locking_range_id, const char *password) +{ + struct opal_session *sess; + struct spdk_opal_key opal_key = {}; + struct spdk_opal_key *active_key; + int ret; + + assert(dev != NULL); + + ret = opal_init_key(&opal_key, password); + if (ret != 0) { + return ret; + } + + active_key = calloc(1, sizeof(*active_key)); + if (!active_key) { + return -ENOMEM; + } + + sess = opal_alloc_session(dev); + if (!sess) { + free(active_key); + return -ENOMEM; + } + + ret = opal_start_auth_session(dev, sess, user_id, &opal_key); + if (ret) { + SPDK_ERRLOG("start authenticate session error %d\n", ret); + free(active_key); + free(sess); + return ret; + } + + ret = opal_get_active_key(dev, sess, locking_range_id, active_key); + if (ret) { + SPDK_ERRLOG("get active key error %d\n", ret); + goto end; + } + + ret = opal_gen_new_active_key(dev, sess, active_key); + if (ret) { + SPDK_ERRLOG("generate new active key error %d\n", ret); + goto end; + } + memset(active_key, 0, sizeof(struct spdk_opal_key)); + +end: + ret += opal_end_session(dev, sess, dev->comid); + if (ret) { + SPDK_ERRLOG("end session error %d\n", ret); + } + free(active_key); + free(sess); + return ret; +} + +struct spdk_opal_d0_features_info * +spdk_opal_get_d0_features_info(struct spdk_opal_dev *dev) +{ + return &dev->feat_info; +} + +bool +spdk_opal_supported(struct spdk_opal_dev *dev) +{ + return false; +} + +struct spdk_opal_locking_range_info * +spdk_opal_get_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + return &dev->locking_ranges[id]; +} + +void +spdk_opal_free_locking_range_info(struct spdk_opal_dev *dev, enum spdk_opal_locking_range id) +{ + struct spdk_opal_locking_range_info *info; + + assert(id < SPDK_OPAL_MAX_LOCKING_RANGE); + info = &dev->locking_ranges[id]; + memset(info, 0, sizeof(*info)); +} + +/* Log component for opal submodule */ +SPDK_LOG_REGISTER_COMPONENT("opal", SPDK_LOG_OPAL) diff --git a/src/spdk/lib/nvme/nvme_opal_internal.h b/src/spdk/lib/nvme/nvme_opal_internal.h new file mode 100644 index 000000000..11815d435 --- /dev/null +++ 
b/src/spdk/lib/nvme/nvme_opal_internal.h @@ -0,0 +1,272 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_OPAL_INTERNAL_H +#define SPDK_OPAL_INTERNAL_H + +#include "spdk/opal_spec.h" +#include "spdk/opal.h" +#include "spdk/scsi_spec.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 +#define OPAL_KEY_MAX 256 +#define OPAL_UID_LENGTH 8 + +#define GENERIC_HOST_SESSION_NUM 0x69 + +#define OPAL_INVAL_PARAM 12 + +#define SPDK_DTAERROR_NO_METHOD_STATUS 0x89 + +enum opal_token_type { + OPAL_DTA_TOKENID_BYTESTRING = 0xE0, + OPAL_DTA_TOKENID_SINT = 0xE1, + OPAL_DTA_TOKENID_UINT = 0xE2, + OPAL_DTA_TOKENID_TOKEN = 0xE3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0, +}; + +enum opal_atom_width { + OPAL_WIDTH_TINY, /* 1 byte in length */ + OPAL_WIDTH_SHORT, /* a 1-byte header and contain up to 15 bytes of data */ + OPAL_WIDTH_MEDIUM, /* a 2-byte header and contain up to 2047 bytes of data */ + OPAL_WIDTH_LONG, /* a 4-byte header and which contain up to 16,777,215 bytes of data */ + OPAL_WIDTH_TOKEN +}; + +enum opal_uid_enum { + /* users */ + UID_SMUID, + UID_THISSP, + UID_ADMINSP, + UID_LOCKINGSP, + UID_ANYBODY, + UID_SID, + UID_ADMIN1, + UID_USER1, + UID_USER2, + + /* tables */ + UID_LOCKINGRANGE_GLOBAL, + UID_LOCKINGRANGE_ACE_RDLOCKED, + UID_LOCKINGRANGE_ACE_WRLOCKED, + UID_MBRCONTROL, + UID_MBR, + UID_AUTHORITY_TABLE, + UID_C_PIN_TABLE, + UID_LOCKING_INFO_TABLE, + UID_PSID, + + /* C_PIN_TABLE object ID's */ + UID_C_PIN_MSID, + UID_C_PIN_SID, + UID_C_PIN_ADMIN1, + UID_C_PIN_USER1, + + /* half UID's (only first 4 bytes used) */ + UID_HALF_AUTHORITY_OBJ_REF, + UID_HALF_BOOLEAN_ACE, +}; + +/* enum for indexing the spdk_opal_method array */ +enum opal_method_enum { + PROPERTIES_METHOD, + STARTSESSION_METHOD, + REVERT_METHOD, + ACTIVATE_METHOD, + NEXT_METHOD, + GETACL_METHOD, + GENKEY_METHOD, + REVERTSP_METHOD, + GET_METHOD, + SET_METHOD, + AUTHENTICATE_METHOD, + RANDOM_METHOD, + ERASE_METHOD, +}; + +struct spdk_opal_key { + 
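/* Host credential taken from the caller-supplied password; key_len is the
+ * number of valid bytes in key[], at most OPAL_KEY_MAX. */ +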
uint8_t key_len; + uint8_t key[OPAL_KEY_MAX]; +}; + +const uint8_t spdk_opal_uid[][OPAL_UID_LENGTH] = { + /* users */ + [UID_SMUID] = /* Session Manager UID */ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [UID_THISSP] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [UID_ADMINSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGSP] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [UID_ANYBODY] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [UID_SID] = /* Security Identifier UID */ + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [UID_ADMIN1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [UID_USER1] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [UID_USER2] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + + /* tables */ + [UID_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [UID_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [UID_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [UID_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [UID_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [UID_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [UID_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [UID_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [UID_PSID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + + /* C_PIN_TABLE object ID's */ + [UID_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [UID_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [UID_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + [UID_C_PIN_USER1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x03, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + [UID_HALF_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [UID_HALF_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, +}; + +/* + * TCG Storage SSC Methods. 
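+ * Each entry is the 8-byte UID of a method defined by the TCG Core and Opal
+ * SSC specifications, indexed by enum opal_method_enum; every method call
+ * sends the invoking object's UID followed by one of these method UIDs.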
+ */ +const uint8_t spdk_opal_method[][OPAL_UID_LENGTH] = { + [PROPERTIES_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [STARTSESSION_METHOD] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [REVERT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [ACTIVATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [NEXT_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [GETACL_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [GENKEY_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [REVERTSP_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [GET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [SET_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [AUTHENTICATE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [RANDOM_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [ERASE_METHOD] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +/* + * Response token + */ +struct spdk_opal_resp_token { + const uint8_t *pos; + uint8_t _padding[7]; + union { + uint64_t unsigned_num; + int64_t signed_num; + } stored; + size_t len; /* header + data */ + enum opal_token_type type; + enum opal_atom_width width; +}; + +struct spdk_opal_resp_parsed { + int num; + struct spdk_opal_resp_token resp_tokens[MAX_TOKS]; +}; + +/* header of a response */ +struct spdk_opal_header { + struct spdk_opal_compacket com_packet; + struct spdk_opal_packet packet; + struct spdk_opal_data_subpacket sub_packet; +}; + +struct opal_session; +struct spdk_opal_dev; + +typedef void (*opal_sess_cb)(struct opal_session *sess, int status, void *ctx); + +struct opal_session { + uint32_t hsn; + uint32_t tsn; + size_t cmd_pos; + uint8_t cmd[IO_BUFFER_LENGTH]; + uint8_t resp[IO_BUFFER_LENGTH]; + struct spdk_opal_resp_parsed parsed_resp; + + opal_sess_cb sess_cb; + void *cb_arg; + bool done; + int status; + struct spdk_opal_dev *dev; +}; + +struct spdk_opal_dev { + struct spdk_nvme_ctrlr *ctrlr; + + uint16_t comid; + + struct spdk_opal_d0_features_info feat_info; + + uint8_t max_ranges; /* max locking range number */ + struct spdk_opal_locking_range_info locking_ranges[SPDK_OPAL_MAX_LOCKING_RANGE]; +}; + +#endif diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c new file mode 100644 index 000000000..132e34cdc --- /dev/null +++ b/src/spdk/lib/nvme/nvme_pcie.c @@ -0,0 +1,2604 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2017, IBM Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over PCIe transport + */ + +#include "spdk/stdinc.h" +#include "spdk/env.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "nvme_internal.h" +#include "nvme_uevent.h" + +/* + * Number of completion queue entries to process before ringing the + * completion queue doorbell. + */ +#define NVME_MIN_COMPLETIONS (1) +#define NVME_MAX_COMPLETIONS (128) + +/* + * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL + * segment. + */ +#define NVME_MAX_SGL_DESCRIPTORS (250) + +#define NVME_MAX_PRP_LIST_ENTRIES (503) + +struct nvme_pcie_enum_ctx { + struct spdk_nvme_probe_ctx *probe_ctx; + struct spdk_pci_addr pci_addr; + bool has_pci_addr; +}; + +/* PCIe transport extensions for spdk_nvme_ctrlr */ +struct nvme_pcie_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + /** NVMe MMIO register space */ + volatile struct spdk_nvme_registers *regs; + + /** NVMe MMIO register size */ + uint64_t regs_size; + + struct { + /* BAR mapping address which contains controller memory buffer */ + void *bar_va; + + /* BAR physical address which contains controller memory buffer */ + uint64_t bar_pa; + + /* Controller memory buffer size in Bytes */ + uint64_t size; + + /* Current offset of controller memory buffer, relative to start of BAR virt addr */ + uint64_t current_offset; + + void *mem_register_addr; + size_t mem_register_size; + } cmb; + + /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */ + uint32_t doorbell_stride_u32; + + /* Opaque handle to associated PCI device. */ + struct spdk_pci_device *devhandle; + + /* Flag to indicate the MMIO register has been remapped */ + bool is_remapped; +}; + +struct nvme_tracker { + TAILQ_ENTRY(nvme_tracker) tq_list; + + struct nvme_request *req; + uint16_t cid; + + uint16_t rsvd0; + uint32_t rsvd1; + + spdk_nvme_cmd_cb cb_fn; + void *cb_arg; + + uint64_t prp_sgl_bus_addr; + + /* Don't move, metadata SGL is always contiguous with Data Block SGL */ + struct spdk_nvme_sgl_descriptor meta_sgl; + union { + uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES]; + struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS]; + } u; +}; +/* + * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary + * and so that there is no padding required to meet alignment requirements. 
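+ * As a rough size check (assuming 8-byte pointers): the fields before the
+ * union total 16 (tq_list) + 8 (req) + 2 + 2 + 4 (cid/rsvd) + 8 (cb_fn) +
+ * 8 (cb_arg) + 8 (prp_sgl_bus_addr) + 16 (meta_sgl) = 72 bytes, and the
+ * union occupies max(503 * 8, 250 * 16) = 4024 bytes, giving exactly 4096.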
+ */ +SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned"); +SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, meta_sgl) & 7) == 0, "SGL must be Qword aligned"); + +struct nvme_pcie_poll_group { + struct spdk_nvme_transport_poll_group group; +}; + +/* PCIe transport extensions for spdk_nvme_qpair */ +struct nvme_pcie_qpair { + /* Submission queue tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue */ + struct spdk_nvme_cmd *cmd; + + /* Completion queue */ + struct spdk_nvme_cpl *cpl; + + TAILQ_HEAD(, nvme_tracker) free_tr; + TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr; + + /* Array of trackers indexed by command ID. */ + struct nvme_tracker *tr; + + uint16_t num_entries; + + uint8_t retry_count; + + uint16_t max_completions_cap; + + uint16_t last_sq_tail; + uint16_t sq_tail; + uint16_t cq_head; + uint16_t sq_head; + + struct { + uint8_t phase : 1; + uint8_t delay_cmd_submit : 1; + uint8_t has_shadow_doorbell : 1; + } flags; + + /* + * Base qpair structure. + * This is located after the hot data in this structure so that the important parts of + * nvme_pcie_qpair are in the same cache line. + */ + struct spdk_nvme_qpair qpair; + + struct { + /* Submission queue shadow tail doorbell */ + volatile uint32_t *sq_tdbl; + + /* Completion queue shadow head doorbell */ + volatile uint32_t *cq_hdbl; + + /* Submission queue event index */ + volatile uint32_t *sq_eventidx; + + /* Completion queue event index */ + volatile uint32_t *cq_eventidx; + } shadow_doorbell; + + /* + * Fields below this point should not be touched on the normal I/O path. 
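+ * They are written when the queue pair is constructed (and cmd_bus_addr /
+ * cpl_bus_addr are read again when the admin queue is programmed into
+ * ASQ/ACQ), so keeping them last leaves the doorbell pointers, tail/head
+ * indices and flags in the leading cache lines.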
+ */ + + bool sq_in_cmb; + + uint64_t cmd_bus_addr; + uint64_t cpl_bus_addr; + + struct spdk_nvme_cmd *sq_vaddr; + struct spdk_nvme_cpl *cq_vaddr; +}; + +static int nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, + struct spdk_pci_addr *pci_addr); +static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts); +static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair); + +__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL; +static uint16_t g_signal_lock; +static bool g_sigset = false; + +static void +nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx) +{ + void *map_address; + uint16_t flag = 0; + + if (!__atomic_compare_exchange_n(&g_signal_lock, &flag, 1, false, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "request g_signal_lock failed\n"); + return; + } + + assert(g_thread_mmio_ctrlr != NULL); + + if (!g_thread_mmio_ctrlr->is_remapped) { + map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (map_address == MAP_FAILED) { + SPDK_ERRLOG("mmap failed\n"); + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); + return; + } + memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers)); + g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address; + g_thread_mmio_ctrlr->is_remapped = true; + } + __atomic_store_n(&g_signal_lock, 0, __ATOMIC_RELEASE); +} + +static void +nvme_pcie_ctrlr_setup_signal(void) +{ + struct sigaction sa; + + sa.sa_sigaction = nvme_sigbus_fault_sighandler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_SIGINFO; + sigaction(SIGBUS, &sa, NULL); +} + +static inline struct nvme_pcie_ctrlr * +nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr); +} + +static int +_nvme_pcie_hotplug_monitor(struct spdk_nvme_probe_ctx *probe_ctx) +{ + struct spdk_nvme_ctrlr *ctrlr, *tmp; + struct spdk_uevent event; + struct spdk_pci_addr pci_addr; + + if (g_spdk_nvme_driver->hotplug_fd < 0) { + return 0; + } + + while (nvme_get_uevent(g_spdk_nvme_driver->hotplug_fd, &event) > 0) { + if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO || + event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) { + if (event.action == SPDK_NVME_UEVENT_ADD) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n", + event.traddr); + if (spdk_process_is_primary()) { + if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) { + nvme_pcie_ctrlr_attach(probe_ctx, &pci_addr); + } + } + } else if (event.action == SPDK_NVME_UEVENT_REMOVE) { + struct spdk_nvme_transport_id trid; + + memset(&trid, 0, sizeof(trid)); + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (ctrlr == NULL) { + return 0; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n", + event.traddr); + + nvme_ctrlr_fail(ctrlr, true); + + /* get the user app to clean up and stop I/O */ + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + } + + /* Initiate removal of physically hotremoved PCI controllers. Even after + * they're hotremoved from the system, SPDK might still report them via RPC. 
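+ * The loop below walks the shared attached-controller list, asks
+ * spdk_pci_device_is_removed() about each PCIe controller's handle, and
+ * fails any controller whose device is gone so that remove_cb can let the
+ * application quiesce I/O and detach it.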
+ */ + TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) { + bool do_remove = false; + struct nvme_pcie_ctrlr *pctrlr; + + if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + continue; + } + + pctrlr = nvme_pcie_ctrlr(ctrlr); + if (spdk_pci_device_is_removed(pctrlr->devhandle)) { + do_remove = true; + } + + if (do_remove) { + nvme_ctrlr_fail(ctrlr, true); + if (ctrlr->remove_cb) { + nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock); + ctrlr->remove_cb(probe_ctx->cb_ctx, ctrlr); + nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock); + } + } + } + return 0; +} + +static inline struct nvme_pcie_qpair * +nvme_pcie_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE); + return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair); +} + +static volatile void * +nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + return (volatile void *)((uintptr_t)pctrlr->regs + offset); +} + +static int +nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value); + g_thread_mmio_ctrlr = NULL; + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 4); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + assert(offset <= sizeof(struct spdk_nvme_registers) - 8); + assert(value != NULL); + g_thread_mmio_ctrlr = pctrlr; + *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset)); + g_thread_mmio_ctrlr = NULL; + if (~(*value) == 0) { + return -1; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq), + value); +} + +static int +nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value) +{ + return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq), + value); +} + +static int +nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa) +{ + return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw), + aqa->raw); +} + +static int +nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw), + &cmbloc->raw); +} + +static int 
+nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz) +{ + return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw), + &cmbsz->raw); +} + +static uint32_t +nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * For commands requiring more than 2 PRP entries, one PRP will be + * embedded in the command (prp1), and the rest of the PRP entries + * will be in a list pointed to by the command (prp2). This means + * that real max number of PRP entries we support is 506+1, which + * results in a max xfer size of 506*ctrlr->page_size. + */ + return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size; +} + +static uint16_t +nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + return NVME_MAX_SGL_DESCRIPTORS; +} + +static void +nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint32_t bir; + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t size, unit_size, offset, bar_size = 0, bar_phys_addr = 0; + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + goto exit; + } + + if (!cmbsz.bits.sz) { + goto exit; + } + + bir = cmbloc.bits.bir; + /* Values 0 2 3 4 5 are valid for BAR */ + if (bir > 5 || bir == 1) { + goto exit; + } + + /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */ + unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu); + /* controller memory buffer size in Bytes */ + size = unit_size * cmbsz.bits.sz; + /* controller memory buffer offset from BAR in Bytes */ + offset = unit_size * cmbloc.bits.ofst; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr, + &bar_phys_addr, &bar_size); + if ((rc != 0) || addr == NULL) { + goto exit; + } + + if (offset > bar_size) { + goto exit; + } + + if (size > bar_size - offset) { + goto exit; + } + + pctrlr->cmb.bar_va = addr; + pctrlr->cmb.bar_pa = bar_phys_addr; + pctrlr->cmb.size = size; + pctrlr->cmb.current_offset = offset; + + if (!cmbsz.bits.sqs) { + pctrlr->ctrlr.opts.use_cmb_sqs = false; + } + + return; +exit: + pctrlr->ctrlr.opts.use_cmb_sqs = false; + return; +} + +static int +nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + union spdk_nvme_cmbloc_register cmbloc; + void *addr = pctrlr->cmb.bar_va; + + if (addr) { + if (pctrlr->cmb.mem_register_addr) { + spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + } + + if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get_cmbloc() failed\n"); + return -EIO; + } + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return -ENOTSUP; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return -ENOTSUP; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_map_io_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + union spdk_nvme_cmbsz_register cmbsz; + union spdk_nvme_cmbloc_register cmbloc; + uint64_t mem_register_start, mem_register_end; + int rc; + + if (pctrlr->cmb.mem_register_addr != NULL) { + *size = pctrlr->cmb.mem_register_size; + return 
pctrlr->cmb.mem_register_addr; + } + + *size = 0; + + if (pctrlr->cmb.bar_va == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n"); + return NULL; + } + + if (ctrlr->opts.use_cmb_sqs) { + SPDK_ERRLOG("CMB is already in use for submission queues.\n"); + return NULL; + } + + if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) || + nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) { + SPDK_ERRLOG("get registers failed\n"); + return NULL; + } + + /* If only SQS is supported */ + if (!(cmbsz.bits.wds || cmbsz.bits.rds)) { + return NULL; + } + + /* If CMB is less than 4MiB in size then abort CMB mapping */ + if (pctrlr->cmb.size < (1ULL << 22)) { + return NULL; + } + + mem_register_start = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + VALUE_2MB - 1); + mem_register_end = _2MB_PAGE((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset + + pctrlr->cmb.size); + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + rc = spdk_mem_register((void *)mem_register_start, mem_register_end - mem_register_start); + if (rc) { + SPDK_ERRLOG("spdk_mem_register() failed\n"); + return NULL; + } + + pctrlr->cmb.mem_register_addr = (void *)mem_register_start; + pctrlr->cmb.mem_register_size = mem_register_end - mem_register_start; + + *size = pctrlr->cmb.mem_register_size; + return pctrlr->cmb.mem_register_addr; +} + +static int +nvme_pcie_ctrlr_unmap_io_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + int rc; + + if (pctrlr->cmb.mem_register_addr == NULL) { + return 0; + } + + rc = spdk_mem_unregister(pctrlr->cmb.mem_register_addr, pctrlr->cmb.mem_register_size); + + if (rc == 0) { + pctrlr->cmb.mem_register_addr = NULL; + pctrlr->cmb.mem_register_size = 0; + } + + return rc; +} + +static int +nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc; + void *addr = NULL; + uint64_t phys_addr = 0, size = 0; + + rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr, + &phys_addr, &size); + + if ((addr == NULL) || (rc != 0)) { + SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n", + rc, addr); + return -1; + } + + pctrlr->regs = (volatile struct spdk_nvme_registers *)addr; + pctrlr->regs_size = size; + nvme_pcie_ctrlr_map_cmb(pctrlr); + + return 0; +} + +static int +nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr) +{ + int rc = 0; + void *addr = (void *)pctrlr->regs; + + if (pctrlr->ctrlr.is_removed) { + return rc; + } + + rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr); + if (rc != 0) { + SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc); + return -1; + } + + if (addr) { + /* NOTE: addr may have been remapped here. We're relying on DPDK to call + * munmap internally. 
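+ * (After a surprise removal, the SIGBUS handler installs an anonymous
+ * MAP_FIXED mapping at the same address, so addr stays dereferenceable even
+ * though it no longer points at the BAR.)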
+ */ + rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr); + } + return rc; +} + +static int +nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t num_entries) +{ + struct nvme_pcie_qpair *pqpair; + int rc; + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return -ENOMEM; + } + + pqpair->num_entries = num_entries; + pqpair->flags.delay_cmd_submit = 0; + + ctrlr->adminq = &pqpair->qpair; + + rc = nvme_qpair_init(ctrlr->adminq, + 0, /* qpair ID */ + ctrlr, + SPDK_NVME_QPRIO_URGENT, + num_entries); + if (rc != 0) { + return rc; + } + + return nvme_pcie_qpair_construct(ctrlr->adminq, NULL); +} + +/* This function must only be called while holding g_spdk_nvme_driver->lock */ +static int +pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev) +{ + struct spdk_nvme_transport_id trid = {}; + struct nvme_pcie_enum_ctx *enum_ctx = ctx; + struct spdk_nvme_ctrlr *ctrlr; + struct spdk_pci_addr pci_addr; + + pci_addr = spdk_pci_device_get_addr(pci_dev); + + spdk_nvme_trid_populate_transport(&trid, SPDK_NVME_TRANSPORT_PCIE); + spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr); + + ctrlr = nvme_get_ctrlr_by_trid_unsafe(&trid); + if (!spdk_process_is_primary()) { + if (!ctrlr) { + SPDK_ERRLOG("Controller must be constructed in the primary process first.\n"); + return -1; + } + + return nvme_ctrlr_add_process(ctrlr, pci_dev); + } + + /* check whether user passes the pci_addr */ + if (enum_ctx->has_pci_addr && + (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) { + return 1; + } + + return nvme_ctrlr_probe(&trid, enum_ctx->probe_ctx, pci_dev); +} + +static int +nvme_pcie_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + struct nvme_pcie_enum_ctx enum_ctx = {}; + + enum_ctx.probe_ctx = probe_ctx; + + if (strlen(probe_ctx->trid.traddr) != 0) { + if (spdk_pci_addr_parse(&enum_ctx.pci_addr, probe_ctx->trid.traddr)) { + return -1; + } + enum_ctx.has_pci_addr = true; + } + + /* Only the primary process can monitor hotplug. 
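Secondary processes skip the uevent scan and simply attach to controllers that the primary has already constructed (see pcie_nvme_enum_cb).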
*/ + if (spdk_process_is_primary()) { + _nvme_pcie_hotplug_monitor(probe_ctx); + } + + if (enum_ctx.has_pci_addr == false) { + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx); + } else { + return spdk_pci_device_attach(spdk_pci_nvme_get_driver(), + pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr); + } +} + +static int +nvme_pcie_ctrlr_attach(struct spdk_nvme_probe_ctx *probe_ctx, struct spdk_pci_addr *pci_addr) +{ + struct nvme_pcie_enum_ctx enum_ctx; + + enum_ctx.probe_ctx = probe_ctx; + enum_ctx.has_pci_addr = true; + enum_ctx.pci_addr = *pci_addr; + + return spdk_pci_enumerate(spdk_pci_nvme_get_driver(), pcie_nvme_enum_cb, &enum_ctx); +} + +static struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct spdk_pci_device *pci_dev = devhandle; + struct nvme_pcie_ctrlr *pctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + uint16_t cmd_reg; + int rc; + struct spdk_pci_id pci_id; + + rc = spdk_pci_device_claim(pci_dev); + if (rc < 0) { + SPDK_ERRLOG("could not claim device %s (%s)\n", + trid->traddr, spdk_strerror(-rc)); + return NULL; + } + + pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pctrlr == NULL) { + spdk_pci_device_unclaim(pci_dev); + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + pctrlr->is_remapped = false; + pctrlr->ctrlr.is_removed = false; + pctrlr->devhandle = devhandle; + pctrlr->ctrlr.opts = *opts; + pctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&pctrlr->ctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + rc = nvme_pcie_ctrlr_allocate_bars(pctrlr); + if (rc != 0) { + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + /* Enable PCI busmaster and disable INTx */ + spdk_pci_device_cfg_read16(pci_dev, &cmd_reg, 4); + cmd_reg |= 0x404; + spdk_pci_device_cfg_write16(pci_dev, cmd_reg, 4); + + if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + spdk_pci_device_unclaim(pci_dev); + spdk_free(pctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs); + + /* Doorbell stride is 2 ^ (dstrd + 2), + * but we want multiples of 4, so drop the + 2 */ + pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd; + + pci_id = spdk_pci_device_get_id(pci_dev); + pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id); + + rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr, pctrlr->ctrlr.opts.admin_queue_size); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + /* Construct the primary process properties */ + rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev); + if (rc != 0) { + nvme_ctrlr_destruct(&pctrlr->ctrlr); + return NULL; + } + + if (g_sigset != true) { + nvme_pcie_ctrlr_setup_signal(); + g_sigset = true; + } + + return &pctrlr->ctrlr; +} + +static int +nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq); + union spdk_nvme_aqa_register aqa; + + if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) { + SPDK_ERRLOG("set_asq() failed\n"); + return -EIO; + } + + if 
(nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) { + SPDK_ERRLOG("set_acq() failed\n"); + return -EIO; + } + + aqa.raw = 0; + /* acqs and asqs are 0-based. */ + aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1; + + if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) { + SPDK_ERRLOG("set_aqa() failed\n"); + return -EIO; + } + + return 0; +} + +static int +nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr); + + if (ctrlr->adminq) { + nvme_pcie_qpair_destroy(ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_ctrlr_free_processes(ctrlr); + + nvme_pcie_ctrlr_free_bars(pctrlr); + + if (devhandle) { + spdk_pci_device_unclaim(devhandle); + spdk_pci_device_detach(devhandle); + } + + spdk_free(pctrlr); + + return 0; +} + +static void +nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr) +{ + tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp); + tr->cid = cid; + tr->req = NULL; +} + +static int +nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + uint32_t i; + + /* all head/tail vals are set to 0 */ + pqpair->last_sq_tail = pqpair->sq_tail = pqpair->sq_head = pqpair->cq_head = 0; + + /* + * First time through the completion queue, HW will set phase + * bit on completions to 1. So set this to 1 here, indicating + * we're looking for a 1 to know which entries have completed. + * we'll toggle the bit each time when the completion queue + * rolls over. + */ + pqpair->flags.phase = 1; + for (i = 0; i < pqpair->num_entries; i++) { + pqpair->cpl[i].status.p = 0; + } + + return 0; +} + +static void * +nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t size, uint64_t alignment, + uint64_t *phys_addr) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + uintptr_t addr; + + if (pctrlr->cmb.mem_register_addr != NULL) { + /* BAR is mapped for data */ + return NULL; + } + + addr = (uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.current_offset; + addr = (addr + (alignment - 1)) & ~(alignment - 1); + + /* CMB may only consume part of the BAR, calculate accordingly */ + if (addr + size > ((uintptr_t)pctrlr->cmb.bar_va + pctrlr->cmb.size)) { + SPDK_ERRLOG("Tried to allocate past valid CMB range!\n"); + return NULL; + } + *phys_addr = pctrlr->cmb.bar_pa + addr - (uintptr_t)pctrlr->cmb.bar_va; + + pctrlr->cmb.current_offset = (addr + size) - (uintptr_t)pctrlr->cmb.bar_va; + + return (void *)addr; +} + +static int +nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + uint16_t i; + volatile uint32_t *doorbell_base; + uint16_t num_trackers; + size_t page_align = sysconf(_SC_PAGESIZE); + size_t queue_align, queue_len; + uint32_t flags = SPDK_MALLOC_DMA; + uint64_t sq_paddr = 0; + uint64_t cq_paddr = 0; + + if (opts) { + pqpair->sq_vaddr = opts->sq.vaddr; + pqpair->cq_vaddr = opts->cq.vaddr; + sq_paddr = opts->sq.paddr; + cq_paddr = opts->cq.paddr; + } + + pqpair->retry_count = ctrlr->opts.transport_retry_count; + + /* + * Limit the maximum number of completions to return per call to prevent wraparound, + 
* and calculate how many trackers can be submitted at once without overflowing the + * completion queue. + */ + pqpair->max_completions_cap = pqpair->num_entries / 4; + pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS); + pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS); + num_trackers = pqpair->num_entries - pqpair->max_completions_cap; + + SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n", + pqpair->max_completions_cap, num_trackers); + + assert(num_trackers != 0); + + pqpair->sq_in_cmb = false; + + if (nvme_qpair_is_admin_queue(&pqpair->qpair)) { + flags |= SPDK_MALLOC_SHARE; + } + + /* cmd and cpl rings must be aligned on page size boundaries. */ + if (ctrlr->opts.use_cmb_sqs) { + pqpair->cmd = nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd), + page_align, &pqpair->cmd_bus_addr); + if (pqpair->cmd != NULL) { + pqpair->sq_in_cmb = true; + } + } + + if (pqpair->sq_in_cmb == false) { + if (pqpair->sq_vaddr) { + pqpair->cmd = pqpair->sq_vaddr; + } else { + /* To ensure physical address contiguity we make each ring occupy + * a single hugepage only. See MAX_IO_QUEUE_ENTRIES. + */ + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cmd); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cmd = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cmd == NULL) { + SPDK_ERRLOG("alloc qpair_cmd failed\n"); + return -ENOMEM; + } + } + if (sq_paddr) { + assert(pqpair->sq_vaddr != NULL); + pqpair->cmd_bus_addr = sq_paddr; + } else { + pqpair->cmd_bus_addr = spdk_vtophys(pqpair->cmd, NULL); + if (pqpair->cmd_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cmd) failed\n"); + return -EFAULT; + } + } + } + + if (pqpair->cq_vaddr) { + pqpair->cpl = pqpair->cq_vaddr; + } else { + queue_len = pqpair->num_entries * sizeof(struct spdk_nvme_cpl); + queue_align = spdk_max(spdk_align32pow2(queue_len), page_align); + pqpair->cpl = spdk_zmalloc(queue_len, queue_align, NULL, SPDK_ENV_SOCKET_ID_ANY, flags); + if (pqpair->cpl == NULL) { + SPDK_ERRLOG("alloc qpair_cpl failed\n"); + return -ENOMEM; + } + } + if (cq_paddr) { + assert(pqpair->cq_vaddr != NULL); + pqpair->cpl_bus_addr = cq_paddr; + } else { + pqpair->cpl_bus_addr = spdk_vtophys(pqpair->cpl, NULL); + if (pqpair->cpl_bus_addr == SPDK_VTOPHYS_ERROR) { + SPDK_ERRLOG("spdk_vtophys(pqpair->cpl) failed\n"); + return -EFAULT; + } + } + + doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl; + pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32; + pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32; + + /* + * Reserve space for all of the trackers in a single allocation. + * struct nvme_tracker must be padded so that its size is already a power of 2. + * This ensures the PRP list embedded in the nvme_tracker object will not span a + * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing. 
+ */ + pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair->tr == NULL) { + SPDK_ERRLOG("nvme_tr failed\n"); + return -ENOMEM; + } + + TAILQ_INIT(&pqpair->free_tr); + TAILQ_INIT(&pqpair->outstanding_tr); + + for (i = 0; i < num_trackers; i++) { + tr = &pqpair->tr[i]; + nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr, NULL)); + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } + + nvme_pcie_qpair_reset(qpair); + + return 0; +} + +/* Used when dst points to MMIO (i.e. CMB) in a virtual machine - in these cases we must + * not use wide instructions because QEMU will not emulate such instructions to MMIO space. + * So this function ensures we only copy 8 bytes at a time. + */ +static inline void +nvme_pcie_copy_command_mmio(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + uint64_t *dst64 = (uint64_t *)dst; + const uint64_t *src64 = (const uint64_t *)src; + uint32_t i; + + for (i = 0; i < sizeof(*dst) / 8; i++) { + dst64[i] = src64[i]; + } +} + +static inline void +nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src) +{ + /* dst and src are known to be non-overlapping and 64-byte aligned. */ +#if defined(__SSE2__) + __m128i *d128 = (__m128i *)dst; + const __m128i *s128 = (const __m128i *)src; + + _mm_stream_si128(&d128[0], _mm_load_si128(&s128[0])); + _mm_stream_si128(&d128[1], _mm_load_si128(&s128[1])); + _mm_stream_si128(&d128[2], _mm_load_si128(&s128[2])); + _mm_stream_si128(&d128[3], _mm_load_si128(&s128[3])); +#else + *dst = *src; +#endif +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct spdk_nvme_cpl *cpl) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *active_req = req; + struct spdk_nvme_ctrlr_process *active_proc; + + /* + * The admin request is from another process. Move to the per + * process list for that process to handle it later. + */ + assert(nvme_qpair_is_admin_queue(qpair)); + assert(active_req->pid != getpid()); + + active_proc = nvme_ctrlr_get_process(ctrlr, active_req->pid); + if (active_proc) { + /* Save the original completion information */ + memcpy(&active_req->cpl, cpl, sizeof(*cpl)); + STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq); + } else { + SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n", + active_req->pid); + + nvme_free_request(active_req); + } +} + +/** + * Note: the ctrlr_lock must be held when calling this function. + */ +static void +nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_request *req, *tmp_req; + pid_t pid = getpid(); + struct spdk_nvme_ctrlr_process *proc; + + /* + * Check whether there is any pending admin request from + * other active processes. 
+ */ + assert(nvme_qpair_is_admin_queue(qpair)); + + proc = nvme_ctrlr_get_current_process(ctrlr); + if (!proc) { + SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid); + assert(proc); + return; + } + + STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) { + STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq); + + assert(req->pid == pid); + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &req->cpl); + nvme_free_request(req); + } +} + +static inline int +nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old) +{ + return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old); +} + +static bool +nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value, + volatile uint32_t *shadow_db, + volatile uint32_t *eventidx) +{ + uint16_t old; + + if (!shadow_db) { + return true; + } + + old = *shadow_db; + *shadow_db = value; + + /* + * Ensure that the doorbell is updated before reading the EventIdx from + * memory. + */ + spdk_mb(); + + if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) { + return false; + } + + return true; +} + +static inline void +nvme_pcie_qpair_ring_sq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (qpair->first_fused_submitted) { + /* This is the first cmd of two fused commands - don't ring the doorbell */ + qpair->first_fused_submitted = 0; + return; + } + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->sq_tail, + pqpair->shadow_doorbell.sq_tdbl, + pqpair->shadow_doorbell.sq_eventidx); + } + + if (spdk_likely(need_mmio)) { + spdk_wmb(); + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail); + g_thread_mmio_ctrlr = NULL; + } +} + +static inline void +nvme_pcie_qpair_ring_cq_doorbell(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr); + bool need_mmio = true; + + if (spdk_unlikely(pqpair->flags.has_shadow_doorbell)) { + need_mmio = nvme_pcie_qpair_update_mmio_required(qpair, + pqpair->cq_head, + pqpair->shadow_doorbell.cq_hdbl, + pqpair->shadow_doorbell.cq_eventidx); + } + + if (spdk_likely(need_mmio)) { + g_thread_mmio_ctrlr = pctrlr; + spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head); + g_thread_mmio_ctrlr = NULL; + } +} + +static void +nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + struct nvme_request *req; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + + req = tr->req; + assert(req != NULL); + + if (req->cmd.fuse == SPDK_NVME_IO_FLAGS_FUSE_FIRST) { + /* This is the first cmd of two fused commands - don't ring the doorbell */ + qpair->first_fused_submitted = 1; + } + + /* Don't use wide instructions to copy the NVMe command; this is limited by the QEMU + * virtual NVMe controller, where the maximum access width is 8 bytes at a time. + */ + if (spdk_unlikely((ctrlr->quirks & NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH) && pqpair->sq_in_cmb)) { + nvme_pcie_copy_command_mmio(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } else { + /* Copy the command from the request to the submission queue. 
*/ + nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd); + } + + if (spdk_unlikely(++pqpair->sq_tail == pqpair->num_entries)) { + pqpair->sq_tail = 0; + } + + if (spdk_unlikely(pqpair->sq_tail == pqpair->sq_head)) { + SPDK_ERRLOG("sq_tail is passing sq_head!\n"); + } + + if (!pqpair->flags.delay_cmd_submit) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + } +} + +static void +nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + struct spdk_nvme_cpl *cpl, bool print_on_error) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_request *req; + bool retry, error; + bool req_from_current_proc = true; + + req = tr->req; + + assert(req != NULL); + + error = spdk_nvme_cpl_is_error(cpl); + retry = error && nvme_completion_is_retry(cpl) && + req->retries < pqpair->retry_count; + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, cpl); + } + + assert(cpl->cid == req->cmd.cid); + + if (retry) { + req->retries++; + nvme_pcie_qpair_submit_tracker(qpair, tr); + } else { + TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list); + + /* Only check admin requests from different processes. */ + if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) { + req_from_current_proc = false; + nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl); + } else { + nvme_complete_request(tr->cb_fn, tr->cb_arg, qpair, req, cpl); + } + + if (req_from_current_proc == true) { + nvme_qpair_free_request(qpair, req); + } + + tr->req = NULL; + + TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list); + } +} + +static void +nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair, + struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, + bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.cid = tr->cid; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); +} + +static void +nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *temp, *last; + + last = TAILQ_LAST(&pqpair->outstanding_tr, nvme_outstanding_tr_head); + + /* Abort previously submitted (outstanding) trs */ + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) { + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting outstanding command\n"); + } + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + + if (tr == last) { + break; + } + } +} + +static int +nvme_pcie_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + rc = iter_fn(tr->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + while (tr != NULL) { + assert(tr->req != NULL); + if (tr->req->cmd.opc == 
SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, + SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0, + false); + tr = TAILQ_FIRST(&pqpair->outstanding_tr); + } else { + tr = TAILQ_NEXT(tr, tq_list); + } + } +} + +static void +nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + nvme_pcie_admin_qpair_abort_aers(qpair); +} + +static int +nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_pcie_admin_qpair_destroy(qpair); + } + /* + * We check sq_vaddr and cq_vaddr to see if the user specified the memory + * buffers when creating the I/O queue. + * If the user specified them, we cannot free that memory. + * Nor do we free it if it's in the CMB. + */ + if (!pqpair->sq_vaddr && pqpair->cmd && !pqpair->sq_in_cmb) { + spdk_free(pqpair->cmd); + } + if (!pqpair->cq_vaddr && pqpair->cpl) { + spdk_free(pqpair->cpl); + } + if (pqpair->tr) { + spdk_free(pqpair->tr); + } + + nvme_qpair_deinit(qpair); + + spdk_free(pqpair); + + return 0; +} + +static void +nvme_pcie_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_pcie_qpair_abort_trackers(qpair, dnr); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, + void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + + cmd->cdw11_bits.create_io_cq.pc = 1; + cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que); + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ; + + cmd->cdw10_bits.create_io_q.qid = io_que->id; + cmd->cdw10_bits.create_io_q.qsize = pqpair->num_entries - 1; + cmd->cdw11_bits.create_io_sq.pc = 1; + cmd->cdw11_bits.create_io_sq.qprio = io_que->qprio; + cmd->cdw11_bits.create_io_sq.cqid = io_que->id; + cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) { + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + spdk_nvme_cmd_cb cb_fn, void *cb_arg) +{ + struct nvme_request *req; + struct spdk_nvme_cmd *cmd; + + req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg); + if (req == NULL) 
{ + return -ENOMEM; + } + + cmd = &req->cmd; + cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ; + cmd->cdw10_bits.delete_io_q.qid = qpair->id; + + return nvme_ctrlr_submit_admin_request(ctrlr, req); +} + +static int +_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair, + uint16_t qid) +{ + struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr); + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_completion_poll_status *status; + int rc; + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_cq failed!\n"); + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + free(status); + return rc; + } + + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + SPDK_ERRLOG("nvme_create_io_sq failed!\n"); + if (status->timed_out) { + /* Request is still queued, the memory will be freed in a completion callback. + Allocate a new request. */ + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + } + + memset(status, 0, sizeof(*status)); + /* Attempt to delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + /* The original or newly allocated status structure can be freed since + * the corresponding request has been completed or failed to submit */ + free(status); + return -1; + } + nvme_wait_for_completion(ctrlr->adminq, status); + if (!status->timed_out) { + /* status can be freed regardless of nvme_wait_for_completion return value */ + free(status); + } + return -1; + } + + if (ctrlr->shadow_doorbell) { + pqpair->shadow_doorbell.sq_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * + pctrlr->doorbell_stride_u32; + pqpair->shadow_doorbell.cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * + pctrlr->doorbell_stride_u32; + pqpair->flags.has_shadow_doorbell = 1; + } else { + pqpair->flags.has_shadow_doorbell = 0; + } + nvme_pcie_qpair_reset(qpair); + free(status); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct nvme_pcie_qpair *pqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + assert(ctrlr != NULL); + + pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (pqpair == NULL) { + return NULL; + } + + pqpair->num_entries = opts->io_queue_size; + pqpair->flags.delay_cmd_submit = opts->delay_cmd_submit; + + qpair = &pqpair->qpair; + + rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests); + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + rc = nvme_pcie_qpair_construct(qpair, opts); + + if (rc != 0) { + nvme_pcie_qpair_destroy(qpair); + return NULL; + } + + return qpair; 
+} + +static int +nvme_pcie_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + if (nvme_qpair_is_admin_queue(qpair)) { + return 0; + } else { + return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id); + } +} + +static void +nvme_pcie_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ +} + +static int +nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_completion_poll_status *status; + int rc; + + assert(ctrlr != NULL); + + if (ctrlr->is_removed) { + goto free; + } + + status = calloc(1, sizeof(*status)); + if (!status) { + SPDK_ERRLOG("Failed to allocate status tracker\n"); + return -ENOMEM; + } + + /* Delete the I/O submission queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_sq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + + memset(status, 0, sizeof(*status)); + /* Delete the completion queue */ + rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, status); + if (rc != 0) { + SPDK_ERRLOG("Failed to send request to delete_io_cq with rc=%d\n", rc); + free(status); + return rc; + } + if (nvme_wait_for_completion(ctrlr->adminq, status)) { + if (!status->timed_out) { + free(status); + } + return -1; + } + free(status); + +free: + if (qpair->no_deletion_notification_needed == 0) { + /* Abort the rest of the I/O */ + nvme_pcie_qpair_abort_trackers(qpair, 1); + } + + nvme_pcie_qpair_destroy(qpair); + return 0; +} + +static void +nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr) +{ + /* + * Bad vtophys translation, so abort this request and return + * immediately. + */ + nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INVALID_FIELD, + 1 /* do not retry */, true); +} + +/* + * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes. + * + * *prp_index will be updated to account for the number of PRP entries used. + */ +static inline int +nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len, + uint32_t page_size) +{ + struct spdk_nvme_cmd *cmd = &tr->req->cmd; + uintptr_t page_mask = page_size - 1; + uint64_t phys_addr; + uint32_t i; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n", + *prp_index, virt_addr, (uint32_t)len); + + if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + return -EFAULT; + } + + i = *prp_index; + while (len) { + uint32_t seg_len; + + /* + * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array, + * so prp_index == count is valid. 
+ */ + if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) { + SPDK_ERRLOG("out of PRP entries\n"); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) { + SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr); + return -EFAULT; + } + + if (i == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr); + cmd->dptr.prp.prp1 = phys_addr; + seg_len = page_size - ((uintptr_t)virt_addr & page_mask); + } else { + if ((phys_addr & page_mask) != 0) { + SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr); + return -EFAULT; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr); + tr->u.prp[i - 1] = phys_addr; + seg_len = page_size; + } + + seg_len = spdk_min(seg_len, len); + virt_addr += seg_len; + len -= seg_len; + i++; + } + + cmd->psdt = SPDK_NVME_PSDT_PRP; + if (i <= 1) { + cmd->dptr.prp.prp2 = 0; + } else if (i == 2) { + cmd->dptr.prp.prp2 = tr->u.prp[0]; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2); + } else { + cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2); + } + + *prp_index = i; + return 0; +} + +static int +nvme_pcie_qpair_build_request_invalid(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, struct nvme_tracker *tr, bool dword_aligned) +{ + assert(0); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +/** + * Build PRP list describing physically contiguous payload buffer. + */ +static int +nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + uint32_t prp_index = 0; + int rc; + + rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset, + req->payload_size, qpair->ctrlr->page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + } + + return rc; +} + +/** + * Build an SGL describing a physically contiguous payload buffer. + * + * This is more efficient than using PRP because large buffers can be + * described this way. 
+ */ +static int +nvme_pcie_qpair_build_contig_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + void *virt_addr; + uint64_t phys_addr, mapping_length; + uint32_t length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + length = req->payload_size; + virt_addr = req->payload.contig_or_cb_arg + req->payload_offset; + mapping_length = length; + + while (length > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + phys_addr = spdk_vtophys(virt_addr, &mapping_length); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + mapping_length = spdk_min(length, mapping_length); + + length -= mapping_length; + virt_addr += mapping_length; + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = mapping_length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; +} + +/** + * Build SGL list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint64_t phys_addr; + uint32_t remaining_transfer_len, remaining_user_sge_len, length; + struct spdk_nvme_sgl_descriptor *sgl; + uint32_t nseg = 0; + + /* + * Build scattered payloads. 
+ */ + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + sgl = tr->u.sgl; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.subtype = 0; + + remaining_transfer_len = req->payload_size; + + while (remaining_transfer_len > 0) { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, + &virt_addr, &remaining_user_sge_len); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + /* Bit Bucket SGL descriptor */ + if ((uint64_t)virt_addr == UINT64_MAX) { + /* TODO: enable WRITE and COMPARE when necessary */ + if (req->cmd.opc != SPDK_NVME_OPC_READ) { + SPDK_ERRLOG("Only READ command can be supported\n"); + goto exit; + } + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_BIT_BUCKET; + /* If the SGL describes a destination data buffer, the length of data + * buffer shall be discarded by controller, and the length is included + * in Number of Logical Blocks (NLB) parameter. Otherwise, the length + * is not included in the NLB parameter. + */ + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + + sgl->unkeyed.length = remaining_user_sge_len; + sgl->address = 0; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + + continue; + } + + remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len); + remaining_transfer_len -= remaining_user_sge_len; + while (remaining_user_sge_len > 0) { + if (nseg >= NVME_MAX_SGL_DESCRIPTORS) { + SPDK_ERRLOG("Too many SGL entries\n"); + goto exit; + } + + if (dword_aligned && ((uintptr_t)virt_addr & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr); + goto exit; + } + + phys_addr = spdk_vtophys(virt_addr, NULL); + if (phys_addr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + + length = spdk_min(remaining_user_sge_len, VALUE_2MB - _2MB_OFFSET(virt_addr)); + remaining_user_sge_len -= length; + virt_addr += length; + + if (nseg > 0 && phys_addr == + (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) { + /* extend previous entry */ + (*(sgl - 1)).unkeyed.length += length; + continue; + } + + sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + sgl->unkeyed.length = length; + sgl->address = phys_addr; + sgl->unkeyed.subtype = 0; + + sgl++; + nseg++; + } + } + + if (nseg == 1) { + /* + * The whole transfer can be described by a single SGL descriptor. + * Use the special case described by the spec where SGL1's type is Data Block. + * This means the SGL in the tracker is not used at all, so copy the first (and only) + * SGL element into SGL1. + */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.address = tr->u.sgl[0].address; + req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length; + } else { + /* SPDK NVMe driver supports only 1 SGL segment for now, it is enough because + * NVME_MAX_SGL_DESCRIPTORS * 16 is less than one page. 
+ */ + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr; + req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor); + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; +} + +/** + * Build PRP list describing scattered payload buffer. + */ +static int +nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req, + struct nvme_tracker *tr, bool dword_aligned) +{ + int rc; + void *virt_addr; + uint32_t remaining_transfer_len, length; + uint32_t prp_index = 0; + uint32_t page_size = qpair->ctrlr->page_size; + + /* + * Build scattered payloads. + */ + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + remaining_transfer_len = req->payload_size; + while (remaining_transfer_len > 0) { + assert(req->payload.next_sge_fn != NULL); + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EFAULT; + } + + length = spdk_min(remaining_transfer_len, length); + + /* + * Any incompatible sges should have been handled up in the splitting routine, + * but assert here as an additional check. + * + * All SGEs except last must end on a page boundary. + */ + assert((length == remaining_transfer_len) || + _is_page_aligned((uintptr_t)virt_addr + length, page_size)); + + rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size); + if (rc) { + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return rc; + } + + remaining_transfer_len -= length; + } + + return 0; +} + +typedef int(*build_req_fn)(struct spdk_nvme_qpair *, struct nvme_request *, struct nvme_tracker *, + bool); + +static build_req_fn const g_nvme_pcie_build_req_table[][2] = { + [NVME_PAYLOAD_TYPE_INVALID] = { + nvme_pcie_qpair_build_request_invalid, /* PRP */ + nvme_pcie_qpair_build_request_invalid /* SGL */ + }, + [NVME_PAYLOAD_TYPE_CONTIG] = { + nvme_pcie_qpair_build_contig_request, /* PRP */ + nvme_pcie_qpair_build_contig_hw_sgl_request /* SGL */ + }, + [NVME_PAYLOAD_TYPE_SGL] = { + nvme_pcie_qpair_build_prps_sgl_request, /* PRP */ + nvme_pcie_qpair_build_hw_sgl_request /* SGL */ + } +}; + +static int +nvme_pcie_qpair_build_metadata(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr, + bool sgl_supported, bool dword_aligned) +{ + void *md_payload; + struct nvme_request *req = tr->req; + + if (req->payload.md) { + md_payload = req->payload.md + req->md_offset; + if (dword_aligned && ((uintptr_t)md_payload & 3)) { + SPDK_ERRLOG("virt_addr %p not dword aligned\n", md_payload); + goto exit; + } + + if (sgl_supported && dword_aligned) { + assert(req->cmd.psdt == SPDK_NVME_PSDT_SGL_MPTR_CONTIG); + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_SGL; + tr->meta_sgl.address = spdk_vtophys(md_payload, NULL); + if (tr->meta_sgl.address == SPDK_VTOPHYS_ERROR) { + goto exit; + } + tr->meta_sgl.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + tr->meta_sgl.unkeyed.length = req->md_size; + tr->meta_sgl.unkeyed.subtype = 0; + req->cmd.mptr = tr->prp_sgl_bus_addr - sizeof(struct spdk_nvme_sgl_descriptor); + } else { + req->cmd.mptr = spdk_vtophys(md_payload, NULL); + if (req->cmd.mptr == SPDK_VTOPHYS_ERROR) { + goto exit; + } + } + } + + return 0; + +exit: + nvme_pcie_fail_request_bad_vtophys(qpair, tr); + return -EINVAL; +} + +static int 
+nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + struct nvme_tracker *tr; + int rc = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + enum nvme_payload_type payload_type; + bool sgl_supported; + bool dword_aligned = true; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + tr = TAILQ_FIRST(&pqpair->free_tr); + + if (tr == NULL) { + /* Inform the upper layer to try again later. */ + rc = -EAGAIN; + goto exit; + } + + TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */ + TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list); + tr->req = req; + tr->cb_fn = req->cb_fn; + tr->cb_arg = req->cb_arg; + req->cmd.cid = tr->cid; + + if (req->payload_size != 0) { + payload_type = nvme_payload_type(&req->payload); + /* According to the specification, PRPs shall be used for all + * Admin commands for NVMe over PCIe implementations. + */ + sgl_supported = (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) != 0 && + !nvme_qpair_is_admin_queue(qpair); + + if (sgl_supported && !(ctrlr->flags & SPDK_NVME_CTRLR_SGL_REQUIRES_DWORD_ALIGNMENT)) { + dword_aligned = false; + } + rc = g_nvme_pcie_build_req_table[payload_type][sgl_supported](qpair, req, tr, dword_aligned); + if (rc < 0) { + goto exit; + } + + rc = nvme_pcie_qpair_build_metadata(qpair, tr, sgl_supported, dword_aligned); + if (rc < 0) { + goto exit; + } + } + + nvme_pcie_qpair_submit_tracker(qpair, tr); + +exit: + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return rc; +} + +static void +nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tracker *tr, *tmp; + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) { + assert(tr->req != NULL); + + if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int32_t +nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair); + struct nvme_tracker *tr; + struct spdk_nvme_cpl *cpl, *next_cpl; + uint32_t num_completions = 0; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + uint16_t next_cq_head; + uint8_t next_phase; + bool next_is_valid = false; + + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_robust_mutex_lock(&ctrlr->ctrlr_lock); + } + + if (max_completions == 0 || max_completions > pqpair->max_completions_cap) { + /* + * max_completions == 0 means unlimited, but complete at most + * max_completions_cap batch of I/O at a time so that the completion + * queue doorbells don't wrap around. 
+ */ + max_completions = pqpair->max_completions_cap; + } + + while (1) { + cpl = &pqpair->cpl[pqpair->cq_head]; + + if (!next_is_valid && cpl->status.p != pqpair->flags.phase) { + break; + } + + if (spdk_likely(pqpair->cq_head + 1 != pqpair->num_entries)) { + next_cq_head = pqpair->cq_head + 1; + next_phase = pqpair->flags.phase; + } else { + next_cq_head = 0; + next_phase = !pqpair->flags.phase; + } + next_cpl = &pqpair->cpl[next_cq_head]; + next_is_valid = (next_cpl->status.p == next_phase); + if (next_is_valid) { + __builtin_prefetch(&pqpair->tr[next_cpl->cid]); + } + +#ifdef __PPC64__ + /* + * This memory barrier prevents reordering of: + * - load after store from/to tr + * - load after load cpl phase and cpl cid + */ + spdk_mb(); +#elif defined(__aarch64__) + __asm volatile("dmb oshld" ::: "memory"); +#endif + + if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) { + pqpair->cq_head = 0; + pqpair->flags.phase = !pqpair->flags.phase; + } + + tr = &pqpair->tr[cpl->cid]; + /* Prefetch the req's STAILQ_ENTRY since we'll need to access it + * as part of putting the req back on the qpair's free list. + */ + __builtin_prefetch(&tr->req->stailq); + pqpair->sq_head = cpl->sqhd; + + if (tr->req) { + nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true); + } else { + SPDK_ERRLOG("cpl does not map to outstanding cmd\n"); + spdk_nvme_qpair_print_completion(qpair, cpl); + assert(0); + } + + if (++num_completions == max_completions) { + break; + } + } + + if (num_completions > 0) { + nvme_pcie_qpair_ring_cq_doorbell(qpair); + } + + if (pqpair->flags.delay_cmd_submit) { + if (pqpair->last_sq_tail != pqpair->sq_tail) { + nvme_pcie_qpair_ring_sq_doorbell(qpair); + pqpair->last_sq_tail = pqpair->sq_tail; + } + } + + if (spdk_unlikely(ctrlr->timeout_enabled)) { + /* + * User registered for timeout callback + */ + nvme_pcie_qpair_check_timeout(qpair); + } + + /* Before returning, complete any pending admin request. 
*/ + if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) { + nvme_pcie_qpair_complete_pending_admin_request(qpair); + + nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock); + } + + return num_completions; +} + +static struct spdk_nvme_transport_poll_group * +nvme_pcie_poll_group_create(void) +{ + struct nvme_pcie_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_pcie_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_pcie_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int64_t +nvme_pcie_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + int32_t local_completions = 0; + int64_t total_completions = 0; + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + local_completions = spdk_nvme_qpair_process_completions(qpair, completions_per_qpair); + if (local_completions < 0) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + local_completions = 0; + } + total_completions += local_completions; + } + + return total_completions; +} + +static int +nvme_pcie_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + free(tgroup); + + return 0; +} + +static struct spdk_pci_id nvme_pci_driver_id[] = { + { + .class_id = SPDK_PCI_CLASS_NVME, + .vendor_id = SPDK_PCI_ANY_ID, + .device_id = SPDK_PCI_ANY_ID, + .subvendor_id = SPDK_PCI_ANY_ID, + .subdevice_id = SPDK_PCI_ANY_ID, + }, + { .vendor_id = 0, /* sentinel */ }, +}; + +SPDK_PCI_DRIVER_REGISTER("nvme", nvme_pci_driver_id, + SPDK_PCI_DRIVER_NEED_MAPPING | SPDK_PCI_DRIVER_WC_ACTIVATE); + +const struct spdk_nvme_transport_ops pcie_ops = { + .name = "PCIE", + .type = SPDK_NVME_TRANSPORT_PCIE, + .ctrlr_construct = nvme_pcie_ctrlr_construct, + .ctrlr_scan = nvme_pcie_ctrlr_scan, + .ctrlr_destruct = nvme_pcie_ctrlr_destruct, + .ctrlr_enable = nvme_pcie_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_pcie_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_pcie_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_pcie_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_pcie_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_pcie_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_pcie_ctrlr_get_max_sges, + + .ctrlr_reserve_cmb = nvme_pcie_ctrlr_reserve_cmb, + .ctrlr_map_cmb = nvme_pcie_ctrlr_map_io_cmb, + .ctrlr_unmap_cmb = nvme_pcie_ctrlr_unmap_io_cmb, + + .ctrlr_create_io_qpair = nvme_pcie_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_pcie_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_pcie_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_pcie_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_pcie_qpair_abort_reqs, + .qpair_reset = nvme_pcie_qpair_reset, + .qpair_submit_request = 
nvme_pcie_qpair_submit_request, + .qpair_process_completions = nvme_pcie_qpair_process_completions, + .qpair_iterate_requests = nvme_pcie_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_pcie_admin_qpair_abort_aers, + + .poll_group_create = nvme_pcie_poll_group_create, + .poll_group_connect_qpair = nvme_pcie_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_pcie_poll_group_disconnect_qpair, + .poll_group_add = nvme_pcie_poll_group_add, + .poll_group_remove = nvme_pcie_poll_group_remove, + .poll_group_process_completions = nvme_pcie_poll_group_process_completions, + .poll_group_destroy = nvme_pcie_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(pcie, &pcie_ops); diff --git a/src/spdk/lib/nvme/nvme_poll_group.c b/src/spdk/lib/nvme/nvme_poll_group.c new file mode 100644 index 000000000..291f55e63 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_poll_group.c @@ -0,0 +1,164 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include "nvme_internal.h" + +struct spdk_nvme_poll_group * +spdk_nvme_poll_group_create(void *ctx) +{ + struct spdk_nvme_poll_group *group; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + return NULL; + } + + group->ctx = ctx; + STAILQ_INIT(&group->tgroups); + + return group; +} + +int +spdk_nvme_poll_group_add(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + const struct spdk_nvme_transport *transport; + + if (nvme_qpair_get_state(qpair) != NVME_QPAIR_DISCONNECTED) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + break; + } + } + + /* See if a new transport has been added (dlopen style) and we need to update the poll group */ + if (!tgroup) { + transport = nvme_get_first_transport(); + while (transport != NULL) { + if (transport == qpair->transport) { + tgroup = nvme_transport_poll_group_create(transport); + if (tgroup == NULL) { + return -ENOMEM; + } + tgroup->group = group; + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + break; + } + transport = nvme_get_next_transport(transport); + } + } + + return tgroup ? nvme_transport_poll_group_add(tgroup, qpair) : -ENODEV; +} + +int +spdk_nvme_poll_group_remove(struct spdk_nvme_poll_group *group, struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + return nvme_transport_poll_group_remove(tgroup, qpair); + } + } + + return -ENODEV; +} + +int +nvme_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_connect_qpair(qpair); +} + +int +nvme_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + return nvme_transport_poll_group_disconnect_qpair(qpair); +} + +int64_t +spdk_nvme_poll_group_process_completions(struct spdk_nvme_poll_group *group, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int64_t local_completions = 0, error_reason = 0, num_completions = 0; + + if (disconnected_qpair_cb == NULL) { + return -EINVAL; + } + + STAILQ_FOREACH(tgroup, &group->tgroups, link) { + local_completions = nvme_transport_poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + if (local_completions < 0 && error_reason == 0) { + error_reason = local_completions; + } else { + num_completions += local_completions; + /* Just to be safe */ + assert(num_completions >= 0); + } + } + + return error_reason ? error_reason : num_completions; +} + +void * +spdk_nvme_poll_group_get_ctx(struct spdk_nvme_poll_group *group) +{ + return group->ctx; +} + +int +spdk_nvme_poll_group_destroy(struct spdk_nvme_poll_group *group) +{ + struct spdk_nvme_transport_poll_group *tgroup, *tmp_tgroup; + + STAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp_tgroup) { + STAILQ_REMOVE(&group->tgroups, tgroup, spdk_nvme_transport_poll_group, link); + if (nvme_transport_poll_group_destroy(tgroup) != 0) { + STAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + return -EBUSY; + } + + } + + free(group); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c new file mode 100644 index 000000000..a3fdc2169 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_qpair.c @@ -0,0 +1,1064 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" +#include "spdk/nvme_ocssd.h" + +#define NVME_CMD_DPTR_STR_SIZE 256 + +static int nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); + +struct nvme_string { + uint16_t value; + const char *str; +}; + +static const struct nvme_string admin_opcode[] = { + { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, + { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, + { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, + { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, + { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, + { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" }, + { SPDK_NVME_OPC_ABORT, "ABORT" }, + { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" }, + { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" }, + { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, + { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" }, + { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" }, + { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, + { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, + { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" }, + { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, + { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, + { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, + { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, + { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, + { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, + { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, + { SPDK_NVME_OPC_FABRIC, "FABRIC" }, + { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, + { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, + { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, + { SPDK_NVME_OPC_SANITIZE, "SANITIZE" }, + { SPDK_NVME_OPC_GET_LBA_STATUS, "GET LBA STATUS" }, + { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" }, + { 0xFFFF, "ADMIN COMMAND" } +}; + +static const struct nvme_string fabric_opcode[] = { + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET, "PROPERTY SET" }, + { 
SPDK_NVMF_FABRIC_COMMAND_CONNECT, "CONNECT" }, + { SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET, "PROPERTY GET" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_SEND, "AUTHENTICATION SEND" }, + { SPDK_NVMF_FABRIC_COMMAND_AUTHENTICATION_RECV, "AUTHENTICATION RECV" }, + { 0xFFFF, "RESERVED / VENDOR SPECIFIC" } +}; + +static const struct nvme_string feat_opcode[] = { + { SPDK_NVME_FEAT_ARBITRATION, "ARBITRATION" }, + { SPDK_NVME_FEAT_POWER_MANAGEMENT, "POWER MANAGEMENT" }, + { SPDK_NVME_FEAT_LBA_RANGE_TYPE, "LBA RANGE TYPE" }, + { SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD, "TEMPERATURE THRESHOLD" }, + { SPDK_NVME_FEAT_ERROR_RECOVERY, "ERROR_RECOVERY" }, + { SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE, "VOLATILE WRITE CACHE" }, + { SPDK_NVME_FEAT_NUMBER_OF_QUEUES, "NUMBER OF QUEUES" }, + { SPDK_NVME_FEAT_INTERRUPT_COALESCING, "INTERRUPT COALESCING" }, + { SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION, "INTERRUPT VECTOR CONFIGURATION" }, + { SPDK_NVME_FEAT_WRITE_ATOMICITY, "WRITE ATOMICITY" }, + { SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, "ASYNC EVENT CONFIGURATION" }, + { SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION, "AUTONOMOUS POWER STATE TRANSITION" }, + { SPDK_NVME_FEAT_HOST_MEM_BUFFER, "HOST MEM BUFFER" }, + { SPDK_NVME_FEAT_TIMESTAMP, "TIMESTAMP" }, + { SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, "KEEP ALIVE TIMER" }, + { SPDK_NVME_FEAT_HOST_CONTROLLED_THERMAL_MANAGEMENT, "HOST CONTROLLED THERMAL MANAGEMENT" }, + { SPDK_NVME_FEAT_NON_OPERATIONAL_POWER_STATE_CONFIG, "NON OPERATIONAL POWER STATE CONFIG" }, + { SPDK_NVME_FEAT_SOFTWARE_PROGRESS_MARKER, "SOFTWARE PROGRESS MARKER" }, + { SPDK_NVME_FEAT_HOST_IDENTIFIER, "HOST IDENTIFIER" }, + { SPDK_NVME_FEAT_HOST_RESERVE_MASK, "HOST RESERVE MASK" }, + { SPDK_NVME_FEAT_HOST_RESERVE_PERSIST, "HOST RESERVE PERSIST" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string io_opcode[] = { + { SPDK_NVME_OPC_FLUSH, "FLUSH" }, + { SPDK_NVME_OPC_WRITE, "WRITE" }, + { SPDK_NVME_OPC_READ, "READ" }, + { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, + { SPDK_NVME_OPC_COMPARE, "COMPARE" }, + { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, + { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, + { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, + { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, + { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, + { SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, + { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" }, + { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" }, + { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" }, + { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" }, + { 0xFFFF, "IO COMMAND" } +}; + +static const struct nvme_string sgl_type[] = { + { SPDK_NVME_SGL_TYPE_DATA_BLOCK, "DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_BIT_BUCKET, "BIT BUCKET" }, + { SPDK_NVME_SGL_TYPE_SEGMENT, "SEGMENT" }, + { SPDK_NVME_SGL_TYPE_LAST_SEGMENT, "LAST SEGMENT" }, + { SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK, "TRANSPORT DATA BLOCK" }, + { SPDK_NVME_SGL_TYPE_VENDOR_SPECIFIC, "VENDOR SPECIFIC" }, + { 0xFFFF, "RESERVED" } +}; + +static const struct nvme_string sgl_subtype[] = { + { SPDK_NVME_SGL_SUBTYPE_ADDRESS, "ADDRESS" }, + { SPDK_NVME_SGL_SUBTYPE_OFFSET, "OFFSET" }, + { SPDK_NVME_SGL_SUBTYPE_TRANSPORT, "TRANSPORT" }, + { SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY, "INVALIDATE KEY" }, + { 0xFFFF, "RESERVED" } +}; + +static const char * +nvme_get_string(const struct nvme_string *strings, uint16_t value) +{ + const struct nvme_string *entry; + + entry = strings; + + while 
(entry->value != 0xFFFF) { + if (entry->value == value) { + return entry->str; + } + entry++; + } + return entry->str; +} + +static void +nvme_get_sgl_unkeyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x", sgl->unkeyed.length); +} + +static void +nvme_get_sgl_keyed(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + + snprintf(buf, size, " len:0x%x key:0x%x", sgl->keyed.length, sgl->keyed.key); +} + +static void +nvme_get_sgl(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + int c; + + c = snprintf(buf, size, "SGL %s %s 0x%" PRIx64, nvme_get_string(sgl_type, sgl->generic.type), + nvme_get_string(sgl_subtype, sgl->generic.subtype), sgl->address); + assert(c >= 0 && (size_t)c < size); + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { + nvme_get_sgl_unkeyed(buf + c, size - c, cmd); + } + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { + nvme_get_sgl_keyed(buf + c, size - c, cmd); + } +} + +static void +nvme_get_prp(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + snprintf(buf, size, "PRP1 0x%" PRIx64 " PRP2 0x%" PRIx64, cmd->dptr.prp.prp1, cmd->dptr.prp.prp2); +} + +static void +nvme_get_dptr(char *buf, size_t size, struct spdk_nvme_cmd *cmd) +{ + if (spdk_nvme_opc_get_data_transfer(cmd->opc) != SPDK_NVME_DATA_NONE) { + switch (cmd->psdt) { + case SPDK_NVME_PSDT_PRP: + nvme_get_prp(buf, size, cmd); + break; + case SPDK_NVME_PSDT_SGL_MPTR_CONTIG: + case SPDK_NVME_PSDT_SGL_MPTR_SGL: + nvme_get_sgl(buf, size, cmd); + break; + default: + ; + } + } +} + +static void +nvme_admin_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + struct spdk_nvmf_capsule_cmd *fcmd = (void *)cmd; + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_SET_FEATURES: + case SPDK_NVME_OPC_GET_FEATURES: + SPDK_NOTICELOG("%s %s cid:%d cdw10:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(feat_opcode, + cmd->cdw10_bits.set_features.fid), cmd->cid, cmd->cdw10, dptr); + break; + case SPDK_NVME_OPC_FABRIC: + SPDK_NOTICELOG("%s %s qid:%d cid:%d %s\n", + nvme_get_string(admin_opcode, cmd->opc), nvme_get_string(fabric_opcode, fcmd->fctype), qid, + fcmd->cid, dptr); + break; + default: + SPDK_NOTICELOG("%s (%02x) qid:%d cid:%d nsid:%x cdw10:%08x cdw11:%08x %s\n", + nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid, cmd->cdw10, + cmd->cdw11, dptr); + } +} + +static void +nvme_io_qpair_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + char dptr[NVME_CMD_DPTR_STR_SIZE] = {'\0'}; + + assert(cmd != NULL); + + nvme_get_dptr(dptr, sizeof(dptr), cmd); + + switch ((int)cmd->opc) { + case SPDK_NVME_OPC_WRITE: + case SPDK_NVME_OPC_READ: + case SPDK_NVME_OPC_WRITE_UNCORRECTABLE: + case SPDK_NVME_OPC_COMPARE: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d " + "lba:%llu len:%d %s\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid, + ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10, + (cmd->cdw12 & 0xFFFF) + 1, dptr); + break; + case SPDK_NVME_OPC_FLUSH: + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n", + nvme_get_string(io_opcode, cmd->opc), qid, cmd->cid, cmd->nsid); + break; + default: + SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n", + 
nvme_get_string(io_opcode, cmd->opc), cmd->opc, qid, cmd->cid, cmd->nsid); + break; + } +} + +void +spdk_nvme_print_command(uint16_t qid, struct spdk_nvme_cmd *cmd) +{ + assert(cmd != NULL); + + if (qid == 0 || cmd->opc == SPDK_NVME_OPC_FABRIC) { + nvme_admin_qpair_print_command(qid, cmd); + } else { + nvme_io_qpair_print_command(qid, cmd); + } +} + +void +spdk_nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd) +{ + assert(qpair != NULL); + assert(cmd != NULL); + + spdk_nvme_print_command(qpair->id, cmd); +} + +static const struct nvme_string generic_status[] = { + { SPDK_NVME_SC_SUCCESS, "SUCCESS" }, + { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, + { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" }, + { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, + { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, + { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, + { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, + { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, + { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, + { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, + { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, + { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, + { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" }, + { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" }, + { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, + { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, + { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" }, + { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" }, + { SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, + { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, + { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" }, + { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" }, + { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" }, + { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" }, + { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" }, + { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, + { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, + { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" }, + { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, + { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, + { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, + { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, + { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, + { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, + { 0xFFFF, "GENERIC" } +}; + +static const struct nvme_string command_specific_status[] = { + { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, + { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, + { SPDK_NVME_SC_INVALID_QUEUE_SIZE, "INVALID QUEUE SIZE" }, + { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, + { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, + { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, + { 
SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, + { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, + { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, + { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, + { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" }, + { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, + { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, + { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, + { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" }, + { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" }, + { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, + { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, + { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, + { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" }, + { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, + { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, + { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" }, + { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" }, + { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" }, + { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" }, + { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" }, + { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" }, + { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, + { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" }, + { SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, + { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, + { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, + { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_RANGE, "WRITE TO RO RANGE" }, + { 0xFFFF, "COMMAND SPECIFIC" } +}; + +static const struct nvme_string media_error_status[] = { + { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, + { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, + { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, + { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, + { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, + { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, + { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, + { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" }, + { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" }, + { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" }, + { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" }, + { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" }, + { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" }, + { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" }, + { 0xFFFF, "MEDIA ERROR" } +}; + +static const struct nvme_string path_status[] = { + { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" }, + { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" }, + { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" }, + { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" }, + 
{ 0xFFFF, "PATH ERROR" } +}; + +const char * +spdk_nvme_cpl_get_status_string(const struct spdk_nvme_status *status) +{ + const struct nvme_string *entry; + + switch (status->sct) { + case SPDK_NVME_SCT_GENERIC: + entry = generic_status; + break; + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + entry = command_specific_status; + break; + case SPDK_NVME_SCT_MEDIA_ERROR: + entry = media_error_status; + break; + case SPDK_NVME_SCT_PATH: + entry = path_status; + break; + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + return "VENDOR SPECIFIC"; + default: + return "RESERVED"; + } + + return nvme_get_string(entry, status->sc); +} + +void +spdk_nvme_print_completion(uint16_t qid, struct spdk_nvme_cpl *cpl) +{ + assert(cpl != NULL); + + /* Check that sqid matches qid. Note that sqid is reserved + * for fabrics so don't print an error when sqid is 0. */ + if (cpl->sqid != qid && cpl->sqid != 0) { + SPDK_ERRLOG("sqid %u doesn't match qid\n", cpl->sqid); + } + + SPDK_NOTICELOG("%s (%02x/%02x) qid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n", + spdk_nvme_cpl_get_status_string(&cpl->status), + cpl->status.sct, cpl->status.sc, qid, cpl->cid, cpl->cdw0, + cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr); +} + +void +spdk_nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl) +{ + spdk_nvme_print_completion(qpair->id, cpl); +} + +bool +nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl) +{ + /* + * TODO: spec is not clear how commands that are aborted due + * to TLER will be marked. So for now, it seems + * NAMESPACE_NOT_READY is the only case where we should + * look at the DNR bit. + */ + switch ((int)cpl->status.sct) { + case SPDK_NVME_SCT_GENERIC: + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_NAMESPACE_NOT_READY: + case SPDK_NVME_SC_FORMAT_IN_PROGRESS: + if (cpl->status.dnr) { + return false; + } else { + return true; + } + case SPDK_NVME_SC_INVALID_OPCODE: + case SPDK_NVME_SC_INVALID_FIELD: + case SPDK_NVME_SC_COMMAND_ID_CONFLICT: + case SPDK_NVME_SC_DATA_TRANSFER_ERROR: + case SPDK_NVME_SC_ABORTED_POWER_LOSS: + case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR: + case SPDK_NVME_SC_ABORTED_BY_REQUEST: + case SPDK_NVME_SC_ABORTED_SQ_DELETION: + case SPDK_NVME_SC_ABORTED_FAILED_FUSED: + case SPDK_NVME_SC_ABORTED_MISSING_FUSED: + case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT: + case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR: + case SPDK_NVME_SC_LBA_OUT_OF_RANGE: + case SPDK_NVME_SC_CAPACITY_EXCEEDED: + default: + return false; + } + case SPDK_NVME_SCT_PATH: + /* + * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be + * based on the setting of the DNR bit for Internal Path Error + */ + switch ((int)cpl->status.sc) { + case SPDK_NVME_SC_INTERNAL_PATH_ERROR: + return !cpl->status.dnr; + default: + return false; + } + case SPDK_NVME_SCT_COMMAND_SPECIFIC: + case SPDK_NVME_SCT_MEDIA_ERROR: + case SPDK_NVME_SCT_VENDOR_SPECIFIC: + default: + return false; + } +} + +static void +nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req, uint32_t sct, uint32_t sc, + uint32_t dnr, bool print_on_error) +{ + struct spdk_nvme_cpl cpl; + bool error; + + memset(&cpl, 0, sizeof(cpl)); + cpl.sqid = qpair->id; + cpl.status.sct = sct; + cpl.status.sc = sc; + cpl.status.dnr = dnr; + + error = spdk_nvme_cpl_is_error(&cpl); + + if (error && print_on_error && !qpair->ctrlr->opts.disable_error_logging) { + SPDK_NOTICELOG("Command completed manually:\n"); + spdk_nvme_qpair_print_command(qpair, &req->cmd); + spdk_nvme_qpair_print_completion(qpair, 
&cpl); + } + + nvme_complete_request(req->cb_fn, req->cb_arg, qpair, req, &cpl); + nvme_free_request(req); +} + +static void +_nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true); + } +} + +/* The callback to a request may submit the next request which is queued and + * then the same callback may abort it immediately. This repetition may cause + * infinite recursive calls. Hence move aborting requests to another list here + * and abort them later at resubmission. + */ +static void +_nvme_qpair_complete_abort_queued_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->aborting_queued_req)) { + req = STAILQ_FIRST(&qpair->aborting_queued_req); + STAILQ_REMOVE_HEAD(&qpair->aborting_queued_req, stailq); + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_ABORTED_BY_REQUEST, 1, true); + } +} + +uint32_t +nvme_qpair_abort_queued_reqs(struct spdk_nvme_qpair *qpair, void *cmd_cb_arg) +{ + struct nvme_request *req, *tmp; + uint32_t aborting = 0; + + STAILQ_FOREACH_SAFE(req, &qpair->queued_req, stailq, tmp) { + if (req->cb_arg == cmd_cb_arg) { + STAILQ_REMOVE(&qpair->queued_req, req, nvme_request, stailq); + STAILQ_INSERT_TAIL(&qpair->aborting_queued_req, req, stailq); + if (!qpair->ctrlr->opts.disable_error_logging) { + SPDK_ERRLOG("aborting queued i/o\n"); + } + aborting++; + } + } + + return aborting; +} + +static inline bool +nvme_qpair_check_enabled(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + /* + * Either during initial connect or reset, the qpair should follow the given state machine. + * QPAIR_DISABLED->QPAIR_CONNECTING->QPAIR_CONNECTED->QPAIR_ENABLING->QPAIR_ENABLED. In the + * reset case, once the qpair is properly connected, we need to abort any outstanding requests + * from the old transport connection and encourage the application to retry them. We also need + * to submit any queued requests that built up while we were in the connected or enabling state. + */ + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTED && !qpair->ctrlr->is_resetting) { + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLING); + /* + * PCIe is special, for fabrics transports, we can abort requests before disconnect during reset + * but we have historically not disconnected pcie qpairs during reset so we have to abort requests + * here. + */ + if (qpair->ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) { + nvme_qpair_abort_reqs(qpair, 0); + } + nvme_qpair_set_state(qpair, NVME_QPAIR_ENABLED); + while (!STAILQ_EMPTY(&qpair->queued_req)) { + req = STAILQ_FIRST(&qpair->queued_req); + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + if (nvme_qpair_resubmit_request(qpair, req)) { + break; + } + } + } + + /* + * When doing a reset, we must disconnect the qpair on the proper core. + * Note, reset is the only case where we set the failure reason without + * setting the qpair state since reset is done at the generic layer on the + * controller thread and we can't disconnect I/O qpairs from the controller + * thread. 
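When _nvme_qpair_abort_queued_reqs() or nvme_qpair_manual_complete_request() above fires, the failure is delivered through the application's normal completion callback. A minimal, illustrative callback using only the public status helpers is sketched below; app_resubmit_io() is a hypothetical application helper, not an SPDK function.

#include <stdio.h>
#include "spdk/nvme.h"

static void app_resubmit_io(void *cb_arg);	/* hypothetical application helper */

/* Sketch of an application spdk_nvme_cmd_cb that distinguishes retryable
 * aborts (e.g. queued I/O aborted around a reset) from hard failures. */
static void
app_io_done(void *cb_arg, const struct spdk_nvme_cpl *cpl)
{
	if (!spdk_nvme_cpl_is_error(cpl)) {
		return;
	}

	fprintf(stderr, "I/O failed: %s (sct=%d sc=%d dnr=%d)\n",
		spdk_nvme_cpl_get_status_string(&cpl->status),
		cpl->status.sct, cpl->status.sc, cpl->status.dnr);

	if (cpl->status.sct == SPDK_NVME_SCT_GENERIC &&
	    cpl->status.sc == SPDK_NVME_SC_ABORTED_BY_REQUEST &&
	    !cpl->status.dnr) {
		app_resubmit_io(cb_arg);
	}
}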
+ */ + if (qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE && + nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) { + /* Don't disconnect PCIe qpairs. They are a special case for reset. */ + if (qpair->ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) { + nvme_ctrlr_disconnect_qpair(qpair); + } + return false; + } + + return nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED; +} + +void +nvme_qpair_resubmit_requests(struct spdk_nvme_qpair *qpair, uint32_t num_requests) +{ + uint32_t i; + int resubmit_rc; + struct nvme_request *req; + + for (i = 0; i < num_requests; i++) { + if (qpair->ctrlr->is_resetting) { + break; + } + if ((req = STAILQ_FIRST(&qpair->queued_req)) == NULL) { + break; + } + STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); + resubmit_rc = nvme_qpair_resubmit_request(qpair, req); + if (spdk_unlikely(resubmit_rc != 0)) { + SPDK_ERRLOG("Unable to resubmit as many requests as we completed.\n"); + break; + } + } + + _nvme_qpair_complete_abort_queued_reqs(qpair); +} + +int32_t +spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + int32_t ret; + struct nvme_request *req, *tmp; + + if (spdk_unlikely(qpair->ctrlr->is_failed)) { + if (qpair->ctrlr->is_removed) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + nvme_qpair_abort_reqs(qpair, 1 /* Do not retry */); + } + return -ENXIO; + } + + if (spdk_unlikely(!nvme_qpair_check_enabled(qpair) && + !(nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING))) { + /* + * qpair is not enabled, likely because a controller reset is + * in progress. + */ + return -ENXIO; + } + + /* error injection for those queued error requests */ + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) { + STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) { + if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) { + STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } + } + } + + qpair->in_completion_context = 1; + ret = nvme_transport_qpair_process_completions(qpair, max_completions); + if (ret < 0) { + SPDK_ERRLOG("CQ error, abort requests after transport retry counter exceeded\n"); + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_ctrlr_fail(qpair->ctrlr, false); + } + } + qpair->in_completion_context = 0; + if (qpair->delete_after_completion_context) { + /* + * A request to delete this qpair was made in the context of this completion + * routine - so it is safe to delete it now. + */ + spdk_nvme_ctrlr_free_io_qpair(qpair); + return ret; + } + + /* + * At this point, ret must represent the number of completions we reaped. + * submit as many queued requests as we completed. 
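For a qpair that is not in a poll group, spdk_nvme_qpair_process_completions() above is the application's polling entry point. The following is a minimal sketch under the assumption that the application counts its outstanding commands itself and decrements that counter from its completion callbacks.

#include "spdk/nvme.h"

/* Busy-poll a single I/O qpair until the application-maintained
 * outstanding-command counter drains. */
static int
app_drain_qpair(struct spdk_nvme_qpair *qpair, volatile int *outstanding)
{
	int32_t rc;

	while (*outstanding > 0) {
		/* max_completions = 0: no per-call limit. */
		rc = spdk_nvme_qpair_process_completions(qpair, 0);
		if (rc < 0) {
			/* e.g. -ENXIO while the controller is failed or resetting. */
			return rc;
		}
	}

	return 0;
}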
+ */ + nvme_qpair_resubmit_requests(qpair, ret); + + return ret; +} + +spdk_nvme_qp_failure_reason +spdk_nvme_qpair_get_failure_reason(struct spdk_nvme_qpair *qpair) +{ + return qpair->transport_failure_reason; +} + +int +nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id, + struct spdk_nvme_ctrlr *ctrlr, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + size_t req_size_padded; + uint32_t i; + + qpair->id = id; + qpair->qprio = qprio; + + qpair->in_completion_context = 0; + qpair->delete_after_completion_context = 0; + qpair->no_deletion_notification_needed = 0; + + qpair->ctrlr = ctrlr; + qpair->trtype = ctrlr->trid.trtype; + + STAILQ_INIT(&qpair->free_req); + STAILQ_INIT(&qpair->queued_req); + STAILQ_INIT(&qpair->aborting_queued_req); + TAILQ_INIT(&qpair->err_cmd_head); + STAILQ_INIT(&qpair->err_req_head); + + req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63; + + qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE); + if (qpair->req_buf == NULL) { + SPDK_ERRLOG("no memory to allocate qpair(cntlid:0x%x sqid:%d) req_buf with %d request\n", + ctrlr->cntlid, qpair->id, num_requests); + return -ENOMEM; + } + + for (i = 0; i < num_requests; i++) { + struct nvme_request *req = qpair->req_buf + i * req_size_padded; + + req->qpair = qpair; + STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq); + } + + return 0; +} + +void +nvme_qpair_complete_error_reqs(struct spdk_nvme_qpair *qpair) +{ + struct nvme_request *req; + + while (!STAILQ_EMPTY(&qpair->err_req_head)) { + req = STAILQ_FIRST(&qpair->err_req_head); + STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq); + nvme_qpair_manual_complete_request(qpair, req, + req->cpl.status.sct, + req->cpl.status.sc, 0, true); + } +} + +void +nvme_qpair_deinit(struct spdk_nvme_qpair *qpair) +{ + struct nvme_error_cmd *cmd, *entry; + + _nvme_qpair_abort_queued_reqs(qpair, 1); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_qpair_complete_error_reqs(qpair); + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + } + + spdk_free(qpair->req_buf); +} + +static inline int +_nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc = 0; + struct nvme_request *child_req, *tmp; + struct nvme_error_cmd *cmd; + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + bool child_req_failed = false; + + nvme_qpair_check_enabled(qpair); + + if (req->num_children) { + /* + * This is a split (parent) request. Submit all of the children but not the parent + * request itself, since the parent is the original unsplit request. + */ + TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) { + if (spdk_likely(!child_req_failed)) { + rc = nvme_qpair_submit_request(qpair, child_req); + if (spdk_unlikely(rc != 0)) { + child_req_failed = true; + } + } else { /* free remaining child_reqs since one child_req fails */ + nvme_request_remove_child(req, child_req); + nvme_request_free_children(child_req); + nvme_free_request(child_req); + } + } + + if (spdk_unlikely(child_req_failed)) { + /* part of children requests have been submitted, + * return success since we must wait for those children to complete, + * but set the parent request to failure. 
+ */ + if (req->num_children) { + req->cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return 0; + } + goto error; + } + + return rc; + } + + /* queue those requests which matches with opcode in err_cmd list */ + if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) { + TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) { + if (!cmd->do_not_submit) { + continue; + } + + if ((cmd->opc == req->cmd.opc) && cmd->err_count) { + /* add to error request list and set cpl */ + req->timeout_tsc = cmd->timeout_tsc; + req->submit_tick = spdk_get_ticks(); + req->cpl.status.sct = cmd->status.sct; + req->cpl.status.sc = cmd->status.sc; + STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq); + cmd->err_count--; + return 0; + } + } + } + + if (spdk_unlikely(ctrlr->is_failed)) { + rc = -ENXIO; + goto error; + } + + /* assign submit_tick before submitting req to specific transport */ + if (spdk_unlikely(ctrlr->timeout_enabled)) { + if (req->submit_tick == 0) { /* req submitted for the first time */ + req->submit_tick = spdk_get_ticks(); + req->timed_out = false; + } + } else { + req->submit_tick = 0; + } + + /* Allow two cases: + * 1. NVMe qpair is enabled. + * 2. Always allow fabrics commands through - these get + * the controller out of reset state. + */ + if (spdk_likely(nvme_qpair_get_state(qpair) == NVME_QPAIR_ENABLED) || + (req->cmd.opc == SPDK_NVME_OPC_FABRIC && + nvme_qpair_get_state(qpair) == NVME_QPAIR_CONNECTING)) { + rc = nvme_transport_qpair_submit_request(qpair, req); + } else { + /* The controller is being reset - queue this request and + * submit it later when the reset is completed. + */ + return -EAGAIN; + } + + if (spdk_likely(rc == 0)) { + req->queued = false; + return 0; + } + + if (rc == -EAGAIN) { + return -EAGAIN; + } + +error: + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + + /* The request is from queued_req list we should trigger the callback from caller */ + if (spdk_unlikely(req->queued)) { + nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC, + SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, true, true); + return rc; + } + + nvme_free_request(req); + + return rc; +} + +int +nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* This prevents us from entering an infinite loop when freeing queued I/O in disconnect. */ + if (spdk_unlikely(nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DESTROYING)) { + if (req->parent != NULL) { + nvme_request_remove_child(req->parent, req); + } + nvme_free_request(req); + return -ENXIO; + } + + if (spdk_unlikely(!STAILQ_EMPTY(&qpair->queued_req) && req->num_children == 0)) { + /* + * requests that have no children should be sent to the transport after all + * currently queued requests. Requests with chilren will be split and go back + * through this path. + */ + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + return 0; + } + + rc = _nvme_qpair_submit_request(qpair, req); + if (rc == -EAGAIN) { + STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); + req->queued = true; + rc = 0; + } + + return rc; +} + +static int +nvme_qpair_resubmit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + int rc; + + /* + * We should never have a request with children on the queue. + * This is necessary to preserve the 1:1 relationship between + * completions and resubmissions. 
+ */ + assert(req->num_children == 0); + assert(req->queued); + rc = _nvme_qpair_submit_request(qpair, req); + if (spdk_unlikely(rc == -EAGAIN)) { + STAILQ_INSERT_HEAD(&qpair->queued_req, req, stailq); + } + + return rc; +} + +void +nvme_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + nvme_qpair_complete_error_reqs(qpair); + _nvme_qpair_abort_queued_reqs(qpair, dnr); + _nvme_qpair_complete_abort_queued_reqs(qpair); + nvme_transport_qpair_abort_reqs(qpair, dnr); +} + +int +spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc, bool do_not_submit, + uint64_t timeout_in_us, + uint32_t err_count, + uint8_t sct, uint8_t sc) +{ + struct nvme_error_cmd *entry, *cmd = NULL; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) { + if (entry->opc == opc) { + cmd = entry; + break; + } + } + + if (cmd == NULL) { + cmd = spdk_zmalloc(sizeof(*cmd), 64, NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); + if (!cmd) { + return -ENOMEM; + } + TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link); + } + + cmd->do_not_submit = do_not_submit; + cmd->err_count = err_count; + cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL; + cmd->opc = opc; + cmd->status.sct = sct; + cmd->status.sc = sc; + + return 0; +} + +void +spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair, + uint8_t opc) +{ + struct nvme_error_cmd *cmd, *entry; + + if (qpair == NULL) { + qpair = ctrlr->adminq; + } + + TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) { + if (cmd->opc == opc) { + TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link); + spdk_free(cmd); + return; + } + } + + return; +} diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c new file mode 100644 index 000000000..38c8f0eae --- /dev/null +++ b/src/spdk/lib/nvme/nvme_quirks.c @@ -0,0 +1,155 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
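spdk_nvme_qpair_add_cmd_error_injection() and its removal counterpart above exist for fault-injection testing. A hedged sketch of a test helper follows; the choice of opcode, status codes and counts is purely illustrative.

#include <stdbool.h>
#include "spdk/nvme.h"

/* Test-only sketch: make the next two READ commands on the qpair complete
 * with UNRECOVERED READ ERROR without reaching the device, then drop the rule. */
static int
test_inject_read_errors(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
{
	int rc;

	rc = spdk_nvme_qpair_add_cmd_error_injection(ctrlr, qpair,
			SPDK_NVME_OPC_READ,
			true,	/* do_not_submit: queue on err_req_head instead */
			0,	/* timeout_in_us: effectively complete on the next poll */
			2,	/* err_count */
			SPDK_NVME_SCT_MEDIA_ERROR,
			SPDK_NVME_SC_UNRECOVERED_READ_ERROR);
	if (rc != 0) {
		return rc;
	}

	/* ... submit READs and poll the qpair here ... */

	spdk_nvme_qpair_remove_cmd_error_injection(ctrlr, qpair, SPDK_NVME_OPC_READ);
	return 0;
}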
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "nvme_internal.h" + +struct nvme_quirk { + struct spdk_pci_id id; + uint64_t flags; +}; + +static const struct nvme_quirk nvme_quirks[] = { + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_DELAY_BEFORE_INIT | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_INTEL_QUIRK_READ_LATENCY | + NVME_INTEL_QUIRK_WRITE_LATENCY | + NVME_INTEL_QUIRK_STRIPING | + NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE | + NVME_QUIRK_MINIMUM_IO_QUEUE_SIZE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_BEFORE_CHK_RDY + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_INTEL_QUIRK_NO_LOG_PAGES | + NVME_QUIRK_MAXIMUM_PCI_ACCESS_WIDTH + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_IDENTIFY_CNS | + NVME_QUIRK_OCSSD + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_VMWARE, 0x07f0, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_SHST_COMPLETE + }, + { {SPDK_PCI_CLASS_NVME, SPDK_PCI_VID_INTEL, 0x2700, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID}, + NVME_QUIRK_OACS_SECURITY + }, + { {0x000000, 0x0000, 0x0000, 0x0000, 0x0000}, 0} +}; + +/* Compare each field. 
SPDK_PCI_ANY_ID in s1 matches everything */ +static bool +pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2) +{ + if ((s1->class_id == SPDK_PCI_CLASS_ANY_ID || s1->class_id == s2->class_id) && + (s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) && + (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) && + (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) && + (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) { + return true; + } + return false; +} + +uint64_t +nvme_get_quirks(const struct spdk_pci_id *id) +{ + const struct nvme_quirk *quirk = nvme_quirks; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n", + id->vendor_id, id->device_id, + id->subvendor_id, id->subdevice_id); + + while (quirk->id.vendor_id) { + if (pci_id_match(&quirk->id, id)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n", + quirk->id.vendor_id, quirk->id.device_id, + quirk->id.subvendor_id, quirk->id.subdevice_id); + +#define PRINT_QUIRK(quirk_flag) \ + do { \ + if (quirk->flags & (quirk_flag)) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \ + } \ + } while (0) + + PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY); + PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY); + PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY); + PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING); + PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC); + PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE); + PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS); + PRINT_QUIRK(NVME_QUIRK_OCSSD); + + return quirk->flags; + } + quirk++; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n"); + + return 0; +} diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c new file mode 100644 index 000000000..84537c4a1 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_rdma.c @@ -0,0 +1,2852 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2019, 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
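nvme_get_quirks() above is consumed inside the driver rather than by applications. A short sketch of the usual pattern is given below; the helper name is invented, and the PCI id would normally come from spdk_pci_device_get_id() on the attached device.

#include "nvme_internal.h"

/* Invented helper: check whether the attached device needs the
 * delay-before-CSTS.RDY-poll workaround listed in the quirk table above. */
static bool
nvme_needs_ready_delay(const struct spdk_pci_id *pci_id)
{
	return (nvme_get_quirks(pci_id) & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) != 0;
}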
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over RDMA transport + */ + +#include "spdk/stdinc.h" + +#include "spdk/assert.h" +#include "spdk/log.h" +#include "spdk/trace.h" +#include "spdk/queue.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/config.h" + +#include "nvme_internal.h" +#include "spdk_internal/rdma.h" + +#define NVME_RDMA_TIME_OUT_IN_MS 2000 +#define NVME_RDMA_RW_BUFFER_SIZE 131072 + +/* + * NVME RDMA qpair Resource Defaults + */ +#define NVME_RDMA_DEFAULT_TX_SGE 2 +#define NVME_RDMA_DEFAULT_RX_SGE 1 + +/* Max number of NVMe-oF SGL descriptors supported by the host */ +#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16 + +/* number of STAILQ entries for holding pending RDMA CM events. */ +#define NVME_RDMA_NUM_CM_EVENTS 256 + +/* CM event processing timeout */ +#define NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US 1000000 + +/* The default size for a shared rdma completion queue. */ +#define DEFAULT_NVME_RDMA_CQ_SIZE 4096 + +/* + * In the special case of a stale connection we don't expose a mechanism + * for the user to retry the connection so we need to handle it internally. + */ +#define NVME_RDMA_STALE_CONN_RETRY_MAX 5 +#define NVME_RDMA_STALE_CONN_RETRY_DELAY_US 10000 + +/* + * Maximum value of transport_retry_count used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT 7 + +/* + * Maximum value of transport_ack_timeout used by RDMA controller + */ +#define NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT 31 + +/* + * Number of poller cycles to keep a pointer to destroyed qpairs + * in the poll group. + */ +#define NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES 50 + +/* + * The max length of keyed SGL data block (3 bytes) + */ +#define NVME_RDMA_MAX_KEYED_SGL_LENGTH ((1u << 24u) - 1) + +#define WC_PER_QPAIR(queue_depth) (queue_depth * 2) + +enum nvme_rdma_wr_type { + RDMA_WR_TYPE_RECV, + RDMA_WR_TYPE_SEND, +}; + +struct nvme_rdma_wr { + /* Using this instead of the enum allows this struct to only occupy one byte. */ + uint8_t type; +}; + +struct spdk_nvmf_cmd { + struct spdk_nvme_cmd cmd; + struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS]; +}; + +struct spdk_nvme_rdma_hooks g_nvme_hooks = {}; + +/* Mapping from virtual address to ibv_mr pointer for a protection domain */ +struct spdk_nvme_rdma_mr_map { + struct ibv_pd *pd; + struct spdk_mem_map *map; + uint64_t ref; + LIST_ENTRY(spdk_nvme_rdma_mr_map) link; +}; + +/* STAILQ wrapper for cm events. 
*/ +struct nvme_rdma_cm_event_entry { + struct rdma_cm_event *evt; + STAILQ_ENTRY(nvme_rdma_cm_event_entry) link; +}; + +/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */ +struct nvme_rdma_ctrlr { + struct spdk_nvme_ctrlr ctrlr; + + struct ibv_pd *pd; + + uint16_t max_sge; + + struct rdma_event_channel *cm_channel; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) pending_cm_events; + + STAILQ_HEAD(, nvme_rdma_cm_event_entry) free_cm_events; + + struct nvme_rdma_cm_event_entry *cm_events; +}; + +struct nvme_rdma_destroyed_qpair { + struct nvme_rdma_qpair *destroyed_qpair_tracker; + uint32_t completed_cycles; + STAILQ_ENTRY(nvme_rdma_destroyed_qpair) link; +}; + +struct nvme_rdma_poller { + struct ibv_context *device; + struct ibv_cq *cq; + int required_num_wc; + int current_num_wc; + STAILQ_ENTRY(nvme_rdma_poller) link; +}; + +struct nvme_rdma_poll_group { + struct spdk_nvme_transport_poll_group group; + STAILQ_HEAD(, nvme_rdma_poller) pollers; + int num_pollers; + STAILQ_HEAD(, nvme_rdma_destroyed_qpair) destroyed_qpairs; +}; + +struct spdk_nvme_send_wr_list { + struct ibv_send_wr *first; + struct ibv_send_wr *last; +}; + +struct spdk_nvme_recv_wr_list { + struct ibv_recv_wr *first; + struct ibv_recv_wr *last; +}; + +/* Memory regions */ +union nvme_rdma_mr { + struct ibv_mr *mr; + uint64_t key; +}; + +/* NVMe RDMA qpair extensions for spdk_nvme_qpair */ +struct nvme_rdma_qpair { + struct spdk_nvme_qpair qpair; + + struct spdk_rdma_qp *rdma_qp; + struct rdma_cm_id *cm_id; + struct ibv_cq *cq; + + struct spdk_nvme_rdma_req *rdma_reqs; + + uint32_t max_send_sge; + + uint32_t max_recv_sge; + + uint16_t num_entries; + + bool delay_cmd_submit; + + bool poll_group_disconnect_in_progress; + + uint32_t num_completions; + + /* Parallel arrays of response buffers + response SGLs of size num_entries */ + struct ibv_sge *rsp_sgls; + struct spdk_nvme_rdma_rsp *rsps; + + struct ibv_recv_wr *rsp_recv_wrs; + + struct spdk_nvme_send_wr_list sends_to_post; + struct spdk_nvme_recv_wr_list recvs_to_post; + + /* Memory region describing all rsps for this qpair */ + union nvme_rdma_mr rsp_mr; + + /* + * Array of num_entries NVMe commands registered as RDMA message buffers. + * Indexed by rdma_req->id. + */ + struct spdk_nvmf_cmd *cmds; + + /* Memory region describing all cmds for this qpair */ + union nvme_rdma_mr cmd_mr; + + struct spdk_nvme_rdma_mr_map *mr_map; + + TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs; + TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs; + + /* Counts of outstanding send and recv objects */ + uint16_t current_num_recvs; + uint16_t current_num_sends; + + /* Placed at the end of the struct since it is not used frequently */ + struct rdma_cm_event *evt; + + /* Used by poll group to keep the qpair around until it is ready to remove it. */ + bool defer_deletion_to_pg; +}; + +enum NVME_RDMA_COMPLETION_FLAGS { + NVME_RDMA_SEND_COMPLETED = 1u << 0, + NVME_RDMA_RECV_COMPLETED = 1u << 1, +}; + +struct spdk_nvme_rdma_req { + uint16_t id; + uint16_t completion_flags: 2; + uint16_t reserved: 14; + /* if completion of RDMA_RECV received before RDMA_SEND, we will complete nvme request + * during processing of RDMA_SEND. 
To complete the request we must know the index + * of nvme_cpl received in RDMA_RECV, so store it in this field */ + uint16_t rsp_idx; + + struct nvme_rdma_wr rdma_wr; + + struct ibv_send_wr send_wr; + + struct nvme_request *req; + + struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE]; + + TAILQ_ENTRY(spdk_nvme_rdma_req) link; +}; + +enum nvme_rdma_key_type { + NVME_RDMA_MR_RKEY, + NVME_RDMA_MR_LKEY +}; + +struct spdk_nvme_rdma_rsp { + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair; + uint16_t idx; + struct nvme_rdma_wr rdma_wr; +}; + +static const char *rdma_cm_event_str[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; + +static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps); +static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER; +struct nvme_rdma_qpair *nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, + uint32_t qp_num); + +static inline void * +nvme_rdma_calloc(size_t nmemb, size_t size) +{ + if (!g_nvme_hooks.get_rkey) { + return calloc(nmemb, size); + } else { + return spdk_zmalloc(nmemb * size, 0, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + } +} + +static inline void +nvme_rdma_free(void *buf) +{ + if (!g_nvme_hooks.get_rkey) { + free(buf); + } else { + spdk_free(buf); + } +} + +static int nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, + struct spdk_nvme_qpair *qpair); + +static inline struct nvme_rdma_qpair * +nvme_rdma_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair); +} + +static inline struct nvme_rdma_poll_group * +nvme_rdma_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return (SPDK_CONTAINEROF(group, struct nvme_rdma_poll_group, group)); +} + +static inline struct nvme_rdma_ctrlr * +nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA); + return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr); +} + +static struct spdk_nvme_rdma_req * +nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_req *rdma_req; + + rdma_req = TAILQ_FIRST(&rqpair->free_reqs); + if (rdma_req) { + TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link); + TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link); + } + + return rdma_req; +} + +static void +nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + rdma_req->completion_flags = 0; + rdma_req->req = NULL; + TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link); +} + +static void +nvme_rdma_req_complete(struct spdk_nvme_rdma_req *rdma_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req = rdma_req->req; + struct nvme_rdma_qpair *rqpair; + + assert(req != NULL); + + rqpair = nvme_rdma_qpair(req->qpair); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static const char * 
+nvme_rdma_cm_event_str_get(uint32_t event) +{ + if (event < SPDK_COUNTOF(rdma_cm_event_str)) { + return rdma_cm_event_str[event]; + } else { + return "Undefined"; + } +} + + +static int +nvme_rdma_qpair_process_cm_event(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_cm_event *event = rqpair->evt; + struct spdk_nvmf_rdma_accept_private_data *accept_data; + int rc = 0; + + if (event) { + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + rc = spdk_rdma_qp_complete_connect(rqpair->rdma_qp); + /* fall through */ + case RDMA_CM_EVENT_ESTABLISHED: + accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data; + if (accept_data == NULL) { + rc = -1; + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n", + rqpair->num_entries, accept_data->crqsize); + rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize); + } + break; + case RDMA_CM_EVENT_DISCONNECTED: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_LOCAL; + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + rqpair->evt = NULL; + rdma_ack_cm_event(event); + } + + return rc; +} + +/* + * This function must be called under the nvme controller's lock + * because it touches global controller variables. The lock is taken + * by the generic transport code before invoking a few of the functions + * in this file: nvme_rdma_ctrlr_connect_qpair, nvme_rdma_ctrlr_delete_io_qpair, + * and conditionally nvme_rdma_qpair_process_completions when it is calling + * completions on the admin qpair. When adding a new call to this function, please + * verify that it is in a situation where it falls under the lock. 
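nvme_rdma_poll_events() below wraps the standard librdmacm event-channel pattern and additionally parks events that belong to other qpairs. Stripped of that bookkeeping, the underlying pattern is roughly the following sketch, assuming the channel fd has already been made non-blocking.

#include <errno.h>
#include <rdma/rdma_cma.h>

/* Bare librdmacm pattern: drain a non-blocking event channel one event at a
 * time; every event handed out by rdma_get_cm_event() must be acknowledged. */
static int
drain_cm_events(struct rdma_event_channel *channel)
{
	struct rdma_cm_event *event;

	while (rdma_get_cm_event(channel, &event) == 0) {
		/* ... dispatch on event->event ... */
		rdma_ack_cm_event(event);
	}

	return (errno == EAGAIN || errno == EWOULDBLOCK) ? 0 : -errno;
}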
+ */ +static int +nvme_rdma_poll_events(struct nvme_rdma_ctrlr *rctrlr) +{ + struct nvme_rdma_cm_event_entry *entry, *tmp; + struct nvme_rdma_qpair *event_qpair; + struct rdma_cm_event *event; + struct rdma_event_channel *channel = rctrlr->cm_channel; + + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + event_qpair = nvme_rdma_qpair(entry->evt->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = entry->evt; + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + + while (rdma_get_cm_event(channel, &event) == 0) { + event_qpair = nvme_rdma_qpair(event->id->context); + if (event_qpair->evt == NULL) { + event_qpair->evt = event; + } else { + assert(rctrlr == nvme_rdma_ctrlr(event_qpair->qpair.ctrlr)); + entry = STAILQ_FIRST(&rctrlr->free_cm_events); + if (entry == NULL) { + rdma_ack_cm_event(event); + return -ENOMEM; + } + STAILQ_REMOVE(&rctrlr->free_cm_events, entry, nvme_rdma_cm_event_entry, link); + entry->evt = event; + STAILQ_INSERT_TAIL(&rctrlr->pending_cm_events, entry, link); + } + } + + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + return errno; + } +} + +static int +nvme_rdma_validate_cm_event(enum rdma_cm_event_type expected_evt_type, + struct rdma_cm_event *reaped_evt) +{ + int rc = -EBADMSG; + + if (expected_evt_type == reaped_evt->event) { + return 0; + } + + switch (expected_evt_type) { + case RDMA_CM_EVENT_ESTABLISHED: + /* + * There is an enum ib_cm_rej_reason in the kernel headers that sets 10 as + * IB_CM_REJ_STALE_CONN. I can't find the corresponding userspace but we get + * the same values here. + */ + if (reaped_evt->event == RDMA_CM_EVENT_REJECTED && reaped_evt->status == 10) { + rc = -ESTALE; + } else if (reaped_evt->event == RDMA_CM_EVENT_CONNECT_RESPONSE) { + /* + * If we are using a qpair which is not created using rdma cm API + * then we will receive RDMA_CM_EVENT_CONNECT_RESPONSE instead of + * RDMA_CM_EVENT_ESTABLISHED. + */ + return 0; + } + break; + default: + break; + } + + SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n", + nvme_rdma_cm_event_str_get(expected_evt_type), + nvme_rdma_cm_event_str_get(reaped_evt->event), reaped_evt->event, + reaped_evt->status); + return rc; +} + +static int +nvme_rdma_process_event(struct nvme_rdma_qpair *rqpair, + struct rdma_event_channel *channel, + enum rdma_cm_event_type evt) +{ + struct nvme_rdma_ctrlr *rctrlr; + uint64_t timeout_ticks; + int rc = 0, rc2; + + if (rqpair->evt != NULL) { + rc = nvme_rdma_qpair_process_cm_event(rqpair); + if (rc) { + return rc; + } + } + + timeout_ticks = (NVME_RDMA_QPAIR_CM_EVENT_TIMEOUT_US * spdk_get_ticks_hz()) / SPDK_SEC_TO_USEC + + spdk_get_ticks(); + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + assert(rctrlr != NULL); + + while (!rqpair->evt && spdk_get_ticks() < timeout_ticks && rc == 0) { + rc = nvme_rdma_poll_events(rctrlr); + } + + if (rc) { + return rc; + } + + if (rqpair->evt == NULL) { + return -EADDRNOTAVAIL; + } + + rc = nvme_rdma_validate_cm_event(evt, rqpair->evt); + + rc2 = nvme_rdma_qpair_process_cm_event(rqpair); + /* bad message takes precedence over the other error codes from processing the event. */ + return rc == 0 ? 
rc2 : rc; +} + +static int +nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair) +{ + int rc; + struct spdk_rdma_qp_init_attr attr = {}; + struct ibv_device_attr dev_attr; + struct nvme_rdma_ctrlr *rctrlr; + + rc = ibv_query_device(rqpair->cm_id->verbs, &dev_attr); + if (rc != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return -1; + } + + if (rqpair->qpair.poll_group) { + assert(!rqpair->cq); + rc = nvme_poll_group_connect_qpair(&rqpair->qpair); + if (rc) { + SPDK_ERRLOG("Unable to activate the rdmaqpair.\n"); + return -1; + } + assert(rqpair->cq); + } else { + rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0); + if (!rqpair->cq) { + SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + if (g_nvme_hooks.get_ibv_pd) { + rctrlr->pd = g_nvme_hooks.get_ibv_pd(&rctrlr->ctrlr.trid, rqpair->cm_id->verbs); + } else { + rctrlr->pd = NULL; + } + + attr.pd = rctrlr->pd; + attr.send_cq = rqpair->cq; + attr.recv_cq = rqpair->cq; + attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */ + attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */ + attr.cap.max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, dev_attr.max_sge); + attr.cap.max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, dev_attr.max_sge); + + rqpair->rdma_qp = spdk_rdma_qp_create(rqpair->cm_id, &attr); + + if (!rqpair->rdma_qp) { + return -1; + } + + /* ibv_create_qp will change the values in attr.cap. Make sure we store the proper value. */ + rqpair->max_send_sge = spdk_min(NVME_RDMA_DEFAULT_TX_SGE, attr.cap.max_send_sge); + rqpair->max_recv_sge = spdk_min(NVME_RDMA_DEFAULT_RX_SGE, attr.cap.max_recv_sge); + rqpair->current_num_recvs = 0; + rqpair->current_num_sends = 0; + + rctrlr->pd = rqpair->rdma_qp->qp->pd; + + rqpair->cm_id->context = &rqpair->qpair; + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_sends(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_send_wr *bad_send_wr; + int rc; + + rc = spdk_rdma_qp_flush_send_wrs(rqpair->rdma_qp, &bad_send_wr); + + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on send queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_send_wr); + while (bad_send_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + bad_send_wr = bad_send_wr->next; + } + return rc; + } + + return 0; +} + +static inline int +nvme_rdma_qpair_submit_recvs(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_recv_wr *bad_recv_wr; + int rc = 0; + + if (rqpair->recvs_to_post.first) { + rc = ibv_post_recv(rqpair->rdma_qp->qp, rqpair->recvs_to_post.first, &bad_recv_wr); + if (spdk_unlikely(rc)) { + SPDK_ERRLOG("Failed to post WRs on receive queue, errno %d (%s), bad_wr %p\n", + rc, spdk_strerror(rc), bad_recv_wr); + while (bad_recv_wr != NULL) { + assert(rqpair->current_num_sends > 0); + rqpair->current_num_recvs--; + bad_recv_wr = bad_recv_wr->next; + } + } + + rqpair->recvs_to_post.first = NULL; + } + return rc; +} + +/* Append the given send wr structure to the qpair's outstanding sends list. */ +/* This function accepts only a single wr. 
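The two submit helpers above flush work requests that were previously chained together, which is what makes the delay_cmd_submit batching worthwhile: several WRs reach the NIC in a single post. A reduced sketch of that chaining idea, outside of any SPDK structures, might look like this (count is assumed to be at least 1).

#include <infiniband/verbs.h>

/* Chain `count` receive work requests through wr->next and post the whole
 * chain with one ibv_post_recv() call; on failure bad_wr points at the first
 * WR that was not posted. */
static int
post_recv_chain(struct ibv_qp *qp, struct ibv_recv_wr *wrs, int count)
{
	struct ibv_recv_wr *bad_wr = NULL;
	int i;

	for (i = 0; i < count - 1; i++) {
		wrs[i].next = &wrs[i + 1];
	}
	wrs[count - 1].next = NULL;

	return ibv_post_recv(qp, &wrs[0], &bad_wr);
}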
*/ +static inline int +nvme_rdma_qpair_queue_send_wr(struct nvme_rdma_qpair *rqpair, struct ibv_send_wr *wr) +{ + assert(wr->next == NULL); + + assert(rqpair->current_num_sends < rqpair->num_entries); + + rqpair->current_num_sends++; + spdk_rdma_qp_queue_send_wrs(rqpair->rdma_qp, wr); + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_sends(rqpair); + } + + return 0; +} + +/* Append the given recv wr structure to the qpair's outstanding recvs list. */ +/* This function accepts only a single wr. */ +static inline int +nvme_rdma_qpair_queue_recv_wr(struct nvme_rdma_qpair *rqpair, struct ibv_recv_wr *wr) +{ + + assert(wr->next == NULL); + assert(rqpair->current_num_recvs < rqpair->num_entries); + + rqpair->current_num_recvs++; + if (rqpair->recvs_to_post.first == NULL) { + rqpair->recvs_to_post.first = wr; + } else { + rqpair->recvs_to_post.last->next = wr; + } + + rqpair->recvs_to_post.last = wr; + + if (!rqpair->delay_cmd_submit) { + return nvme_rdma_qpair_submit_recvs(rqpair); + } + + return 0; +} + +#define nvme_rdma_trace_ibv_sge(sg_list) \ + if (sg_list) { \ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \ + (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \ + } + +static int +nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx) +{ + struct ibv_recv_wr *wr; + + wr = &rqpair->rsp_recv_wrs[rsp_idx]; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_recv_wr(rqpair, wr); +} + +static int +nvme_rdma_reg_mr(struct rdma_cm_id *cm_id, union nvme_rdma_mr *mr, void *mem, size_t length) +{ + if (!g_nvme_hooks.get_rkey) { + mr->mr = rdma_reg_msgs(cm_id, mem, length); + if (mr->mr == NULL) { + SPDK_ERRLOG("Unable to register mr: %s (%d)\n", + spdk_strerror(errno), errno); + return -1; + } + } else { + mr->key = g_nvme_hooks.get_rkey(cm_id->pd, mem, length); + } + + return 0; +} + +static void +nvme_rdma_dereg_mr(union nvme_rdma_mr *mr) +{ + if (!g_nvme_hooks.get_rkey) { + if (mr->mr && rdma_dereg_mr(mr->mr)) { + SPDK_ERRLOG("Unable to de-register mr\n"); + } + } else { + if (mr->key) { + g_nvme_hooks.put_rkey(mr->key); + } + } + memset(mr, 0, sizeof(*mr)); +} + +static uint32_t +nvme_rdma_mr_get_lkey(union nvme_rdma_mr *mr) +{ + uint32_t lkey; + + if (!g_nvme_hooks.get_rkey) { + lkey = mr->mr->lkey; + } else { + lkey = *((uint64_t *) mr->key); + } + + return lkey; +} + +static void +nvme_rdma_unregister_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->rsp_mr); +} + +static void +nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_free(rqpair->rsps); + rqpair->rsps = NULL; + nvme_rdma_free(rqpair->rsp_sgls); + rqpair->rsp_sgls = NULL; + nvme_rdma_free(rqpair->rsp_recv_wrs); + rqpair->rsp_recv_wrs = NULL; +} + +static int +nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair) +{ + rqpair->rsps = NULL; + rqpair->rsp_recv_wrs = NULL; + + rqpair->rsp_sgls = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls)); + if (!rqpair->rsp_sgls) { + SPDK_ERRLOG("Failed to allocate rsp_sgls\n"); + goto fail; + } + + rqpair->rsp_recv_wrs = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsp_recv_wrs)); + if (!rqpair->rsp_recv_wrs) { + SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n"); + goto fail; + } + + rqpair->rsps = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->rsps)); + if (!rqpair->rsps) { + SPDK_ERRLOG("can not allocate rdma rsps\n"); + goto fail; + } + + return 0; +fail: + nvme_rdma_free_rsps(rqpair); + return -ENOMEM; +} + +static int 
+nvme_rdma_register_rsps(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->rsp_mr, + rqpair->rsps, rqpair->num_entries * sizeof(*rqpair->rsps)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->rsp_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i]; + struct spdk_nvme_rdma_rsp *rsp = &rqpair->rsps[i]; + + rsp->rqpair = rqpair; + rsp->rdma_wr.type = RDMA_WR_TYPE_RECV; + rsp->idx = i; + rsp_sgl->addr = (uint64_t)&rqpair->rsps[i]; + rsp_sgl->length = sizeof(struct spdk_nvme_cpl); + rsp_sgl->lkey = lkey; + + rqpair->rsp_recv_wrs[i].wr_id = (uint64_t)&rsp->rdma_wr; + rqpair->rsp_recv_wrs[i].next = NULL; + rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl; + rqpair->rsp_recv_wrs[i].num_sge = 1; + + rc = nvme_rdma_post_recv(rqpair, i); + if (rc) { + goto fail; + } + } + + rc = nvme_rdma_qpair_submit_recvs(rqpair); + if (rc) { + goto fail; + } + + return 0; + +fail: + nvme_rdma_unregister_rsps(rqpair); + return rc; +} + +static void +nvme_rdma_unregister_reqs(struct nvme_rdma_qpair *rqpair) +{ + nvme_rdma_dereg_mr(&rqpair->cmd_mr); +} + +static void +nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair) +{ + if (!rqpair->rdma_reqs) { + return; + } + + nvme_rdma_free(rqpair->cmds); + rqpair->cmds = NULL; + + nvme_rdma_free(rqpair->rdma_reqs); + rqpair->rdma_reqs = NULL; +} + +static int +nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair) +{ + uint16_t i; + + rqpair->rdma_reqs = nvme_rdma_calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req)); + if (rqpair->rdma_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate rdma_reqs\n"); + goto fail; + } + + rqpair->cmds = nvme_rdma_calloc(rqpair->num_entries, sizeof(*rqpair->cmds)); + if (!rqpair->cmds) { + SPDK_ERRLOG("Failed to allocate RDMA cmds\n"); + goto fail; + } + + + TAILQ_INIT(&rqpair->free_reqs); + TAILQ_INIT(&rqpair->outstanding_reqs); + for (i = 0; i < rqpair->num_entries; i++) { + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvmf_cmd *cmd; + + rdma_req = &rqpair->rdma_reqs[i]; + rdma_req->rdma_wr.type = RDMA_WR_TYPE_SEND; + cmd = &rqpair->cmds[i]; + + rdma_req->id = i; + + /* The first RDMA sgl element will always point + * at this data structure. Depending on whether + * an NVMe-oF SGL is required, the length of + * this element may change. 
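+ * When additional in-capsule SGL descriptors are needed, send_sgl[0].length is
+ * grown to cover them (see nvme_rdma_build_sgl_request); inline data payloads
+ * instead occupy a second SGE, send_sgl[1].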
*/ + rdma_req->send_sgl[0].addr = (uint64_t)cmd; + rdma_req->send_wr.wr_id = (uint64_t)&rdma_req->rdma_wr; + rdma_req->send_wr.next = NULL; + rdma_req->send_wr.opcode = IBV_WR_SEND; + rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->send_wr.sg_list = rdma_req->send_sgl; + rdma_req->send_wr.imm_data = 0; + + TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link); + } + + return 0; +fail: + nvme_rdma_free_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_register_reqs(struct nvme_rdma_qpair *rqpair) +{ + int i; + int rc; + uint32_t lkey; + + rc = nvme_rdma_reg_mr(rqpair->cm_id, &rqpair->cmd_mr, + rqpair->cmds, rqpair->num_entries * sizeof(*rqpair->cmds)); + + if (rc < 0) { + goto fail; + } + + lkey = nvme_rdma_mr_get_lkey(&rqpair->cmd_mr); + + for (i = 0; i < rqpair->num_entries; i++) { + rqpair->rdma_reqs[i].send_sgl[0].lkey = lkey; + } + + return 0; + +fail: + nvme_rdma_unregister_reqs(rqpair); + return -ENOMEM; +} + +static int +nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, + struct rdma_event_channel *cm_channel) +{ + int ret; + + ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr, + NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA address resolution error\n"); + return -1; + } + + if (rqpair->qpair.ctrlr->opts.transport_ack_timeout != SPDK_NVME_TRANSPORT_ACK_TIMEOUT_DISABLED) { +#ifdef SPDK_CONFIG_RDMA_SET_ACK_TIMEOUT + uint8_t timeout = rqpair->qpair.ctrlr->opts.transport_ack_timeout; + ret = rdma_set_option(rqpair->cm_id, RDMA_OPTION_ID, + RDMA_OPTION_ID_ACK_TIMEOUT, + &timeout, sizeof(timeout)); + if (ret) { + SPDK_NOTICELOG("Can't apply RDMA_OPTION_ID_ACK_TIMEOUT %d, ret %d\n", timeout, ret); + } +#else + SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport_ack_timeout is not supported\n"); +#endif + } + + + ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS); + if (ret) { + SPDK_ERRLOG("rdma_resolve_route\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED); + if (ret) { + SPDK_ERRLOG("RDMA route resolution error\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_connect(struct nvme_rdma_qpair *rqpair) +{ + struct rdma_conn_param param = {}; + struct spdk_nvmf_rdma_request_private_data request_data = {}; + struct ibv_device_attr attr; + int ret; + struct spdk_nvme_ctrlr *ctrlr; + struct nvme_rdma_ctrlr *rctrlr; + + ret = ibv_query_device(rqpair->cm_id->verbs, &attr); + if (ret != 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + return ret; + } + + param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom); + + ctrlr = rqpair->qpair.ctrlr; + if (!ctrlr) { + return -1; + } + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + request_data.qid = rqpair->qpair.id; + request_data.hrqsize = rqpair->num_entries; + request_data.hsqsize = rqpair->num_entries - 1; + request_data.cntlid = ctrlr->cntlid; + + param.private_data = &request_data; + param.private_data_len = sizeof(request_data); + param.retry_count = ctrlr->opts.transport_retry_count; + param.rnr_retry_count = 7; + + /* Fields below are ignored by rdma cm if qpair has been + * created using rdma cm API. 
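+ * They would only matter for a QP created outside of rdma cm, i.e. the
+ * RDMA_CM_EVENT_CONNECT_RESPONSE case noted in nvme_rdma_validate_cm_event.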
*/ + param.srq = 0; + param.qp_num = rqpair->rdma_qp->qp->qp_num; + + ret = rdma_connect(rqpair->cm_id, ¶m); + if (ret) { + SPDK_ERRLOG("nvme rdma connect error\n"); + return ret; + } + + ret = nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_ESTABLISHED); + if (ret == -ESTALE) { + SPDK_NOTICELOG("Received a stale connection notice during connection.\n"); + return -EAGAIN; + } else if (ret) { + SPDK_ERRLOG("RDMA connect error %d\n", ret); + return ret; + } else { + return 0; + } +} + +static int +nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static int +nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct ibv_pd *pd = cb_ctx; + struct ibv_mr *mr; + int rc; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -EFAULT; + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + } else { + rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, + g_nvme_hooks.get_rkey(pd, vaddr, size)); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + if (mr) { + ibv_dereg_mr(mr); + } + } + rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + break; + default: + SPDK_UNREACHABLE(); + } + + return rc; +} + +static int +nvme_rdma_check_contiguous_entries(uint64_t addr_1, uint64_t addr_2) +{ + /* Two contiguous mappings will point to the same address which is the start of the RDMA MR. 
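+ * (The translation stored for each registration is the ibv_mr pointer itself,
+ * or the hook-provided key, so equal translations imply a single MR.)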
*/ + return addr_1 == addr_2; +} + +static int +nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair) +{ + struct ibv_pd *pd = rqpair->rdma_qp->qp->pd; + struct spdk_nvme_rdma_mr_map *mr_map; + const struct spdk_mem_map_ops nvme_rdma_map_ops = { + .notify_cb = nvme_rdma_mr_map_notify, + .are_contiguous = nvme_rdma_check_contiguous_entries + }; + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + /* Look up existing mem map registration for this pd */ + LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) { + if (mr_map->pd == pd) { + mr_map->ref++; + rqpair->mr_map = mr_map; + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return 0; + } + } + + mr_map = nvme_rdma_calloc(1, sizeof(*mr_map)); + if (mr_map == NULL) { + SPDK_ERRLOG("Failed to allocate mr_map\n"); + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + mr_map->ref = 1; + mr_map->pd = pd; + mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd); + if (mr_map->map == NULL) { + SPDK_ERRLOG("spdk_mem_map_alloc() failed\n"); + nvme_rdma_free(mr_map); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + return -1; + } + + rqpair->mr_map = mr_map; + LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link); + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); + + return 0; +} + +static void +nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair) +{ + struct spdk_nvme_rdma_mr_map *mr_map; + + mr_map = rqpair->mr_map; + rqpair->mr_map = NULL; + + if (mr_map == NULL) { + return; + } + + pthread_mutex_lock(&g_rdma_mr_maps_mutex); + + assert(mr_map->ref > 0); + mr_map->ref--; + if (mr_map->ref == 0) { + LIST_REMOVE(mr_map, link); + spdk_mem_map_free(&mr_map->map); + nvme_rdma_free(mr_map); + } + + pthread_mutex_unlock(&g_rdma_mr_maps_mutex); +} + +static int +_nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + bool src_addr_specified; + int rc; + struct nvme_rdma_ctrlr *rctrlr; + struct nvme_rdma_qpair *rqpair; + int family; + + rqpair = nvme_rdma_qpair(qpair); + rctrlr = nvme_rdma_ctrlr(ctrlr); + assert(rctrlr != NULL); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n"); + return -1; + } + src_addr_specified = true; + } else { + src_addr_specified = false; + } + + rc = rdma_create_id(rctrlr->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + return -1; + } + + rc = nvme_rdma_resolve_addr(rqpair, + src_addr_specified ? 
(struct sockaddr *)&src_addr : NULL, + (struct sockaddr *)&dst_addr, rctrlr->cm_channel); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n"); + return -1; + } + + rc = nvme_rdma_qpair_init(rqpair); + if (rc < 0) { + SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n"); + return -1; + } + + rc = nvme_rdma_connect(rqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the rqpair\n"); + return rc; + } + + rc = nvme_rdma_register_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to register rqpair RDMA requests\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests registered\n"); + + rc = nvme_rdma_register_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to register rqpair RDMA responses\n"); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses registered\n"); + + rc = nvme_rdma_register_mem(rqpair); + if (rc < 0) { + SPDK_ERRLOG("Unable to register memory for RDMA\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries); + if (rc < 0) { + rqpair->qpair.transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static int +nvme_rdma_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + int rc; + int retry_count = 0; + + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + + /* + * -EAGAIN represents the special case where the target side still thought it was connected. + * Most NICs will fail the first connection attempt, and the NICs will clean up whatever + * state they need to. After that, subsequent connection attempts will succeed. + */ + if (rc == -EAGAIN) { + SPDK_NOTICELOG("Detected stale connection on Target side for qpid: %d\n", qpair->id); + do { + nvme_delay(NVME_RDMA_STALE_CONN_RETRY_DELAY_US); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + rc = _nvme_rdma_ctrlr_connect_qpair(ctrlr, qpair); + retry_count++; + } while (rc == -EAGAIN && retry_count < NVME_RDMA_STALE_CONN_RETRY_MAX); + } + + return rc; +} + +/* + * Build SGL describing empty payload. + */ +static int +nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. 
*/ + rdma_req->send_wr.num_sge = 1; + + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = 0; + req->cmd.dptr.sgl1.keyed.key = 0; + req->cmd.dptr.sgl1.address = 0; + + return 0; +} + +static inline bool +nvme_rdma_get_key(struct spdk_mem_map *map, void *payload, uint64_t size, + enum nvme_rdma_key_type key_type, uint32_t *key) +{ + struct ibv_mr *mr; + uint64_t real_size = size; + uint32_t _key = 0; + + if (!g_nvme_hooks.get_rkey) { + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + + if (spdk_unlikely(!mr)) { + SPDK_ERRLOG("No translation for ptr %p, size %lu\n", payload, size); + return false; + } + switch (key_type) { + case NVME_RDMA_MR_RKEY: + _key = mr->rkey; + break; + case NVME_RDMA_MR_LKEY: + _key = mr->lkey; + break; + default: + SPDK_ERRLOG("Invalid key type %d\n", key_type); + assert(0); + return false; + } + } else { + _key = spdk_mem_map_translate(map, (uint64_t)payload, &real_size); + } + + if (spdk_unlikely(real_size < size)) { + SPDK_ERRLOG("Data buffer split over multiple RDMA Memory Regions\n"); + return false; + } + + *key = _key; + return true; +} + +/* + * Build inline SGL describing contiguous payload buffer. + */ +static int +nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + void *payload; + + payload = req->payload.contig_or_cb_arg + req->payload_offset; + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].lkey = lkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + rdma_req->send_sgl[1].addr = (uint64_t)payload; + rdma_req->send_sgl[1].length = (uint32_t)req->payload_size; + + /* The RDMA SGL contains two elements. The first describes + * the NVMe command and the second describes the data + * payload. */ + rdma_req->send_wr.num_sge = 2; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. */ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. 
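+ * The whole buffer is exposed to the target as one keyed SGL data block, so it
+ * must not exceed NVME_RDMA_MAX_KEYED_SGL_LENGTH.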
+ */ +static int +nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + void *payload = req->payload.contig_or_cb_arg + req->payload_offset; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + if (spdk_unlikely(req->payload_size > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + req->payload_size, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, payload, req->payload_size, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + req->cmd.dptr.sgl1.keyed.key = rkey; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + /* The RDMA SGL needs one element describing the NVMe command. */ + rdma_req->send_wr.num_sge = 1; + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + req->cmd.dptr.sgl1.keyed.length = req->payload_size; + req->cmd.dptr.sgl1.address = (uint64_t)payload; + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id]; + void *virt_addr; + uint32_t remaining_size; + uint32_t sge_length; + int rc, max_num_sgl, num_sgl_desc; + uint32_t rkey = 0; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = req->qpair->ctrlr->max_sges; + + remaining_size = req->payload_size; + num_sgl_desc = 0; + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length); + if (rc) { + return -1; + } + + sge_length = spdk_min(remaining_size, sge_length); + + if (spdk_unlikely(sge_length > NVME_RDMA_MAX_KEYED_SGL_LENGTH)) { + SPDK_ERRLOG("SGL length %u exceeds max keyed SGL block size %u\n", + sge_length, NVME_RDMA_MAX_KEYED_SGL_LENGTH); + return -1; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, sge_length, + NVME_RDMA_MR_RKEY, &rkey))) { + return -1; + } + + cmd->sgl[num_sgl_desc].keyed.key = rkey; + cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK; + cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS; + cmd->sgl[num_sgl_desc].keyed.length = sge_length; + cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr; + + remaining_size -= sge_length; + num_sgl_desc++; + } while (remaining_size > 0 && num_sgl_desc < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */ + if (remaining_size > 0) { + return -1; + } + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + + /* The RDMA SGL needs one element describing some portion + * of the spdk_nvmf_cmd structure. 
*/ + rdma_req->send_wr.num_sge = 1; + + /* + * If only one SGL descriptor is required, it can be embedded directly in the command + * as a data block descriptor. + */ + if (num_sgl_desc == 1) { + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.dptr.sgl1.keyed.type = cmd->sgl[0].keyed.type; + req->cmd.dptr.sgl1.keyed.subtype = cmd->sgl[0].keyed.subtype; + req->cmd.dptr.sgl1.keyed.length = cmd->sgl[0].keyed.length; + req->cmd.dptr.sgl1.keyed.key = cmd->sgl[0].keyed.key; + req->cmd.dptr.sgl1.address = cmd->sgl[0].address; + } else { + /* + * Otherwise, The SGL descriptor embedded in the command must point to the list of + * SGL descriptors used to describe the operation. In that case it is a last segment descriptor. + */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct + spdk_nvme_sgl_descriptor) * num_sgl_desc; + + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor); + req->cmd.dptr.sgl1.address = (uint64_t)0; + } + + return 0; +} + +/* + * Build inline SGL describing sgl payload buffer. + */ +static int +nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct nvme_request *req = rdma_req->req; + uint32_t lkey = 0; + uint32_t length; + void *virt_addr; + int rc; + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length); + if (rc) { + return -1; + } + + if (length < req->payload_size) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Inline SGL request split so sending separately.\n"); + return nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + + if (length > req->payload_size) { + length = req->payload_size; + } + + if (spdk_unlikely(!nvme_rdma_get_key(rqpair->mr_map->map, virt_addr, length, + NVME_RDMA_MR_LKEY, &lkey))) { + return -1; + } + + rdma_req->send_sgl[1].addr = (uint64_t)virt_addr; + rdma_req->send_sgl[1].length = length; + rdma_req->send_sgl[1].lkey = lkey; + + rdma_req->send_wr.num_sge = 2; + + /* The first element of this SGL is pointing at an + * spdk_nvmf_cmd object. For this particular command, + * we only need the first 64 bytes corresponding to + * the NVMe command. */ + rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd); + + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size; + /* Inline only supported for icdoff == 0 currently. This function will + * not get called for controllers with other values. 
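+ * (nvme_rdma_req_init only chooses the inline paths for host-to-controller
+ * transfers that fit in ioccsz_bytes on controllers reporting icdoff == 0.)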
*/ + req->cmd.dptr.sgl1.address = (uint64_t)0; + + return 0; +} + +static int +nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req, + struct spdk_nvme_rdma_req *rdma_req) +{ + struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr; + enum nvme_payload_type payload_type; + bool icd_supported; + int rc; + + assert(rdma_req->req == NULL); + rdma_req->req = req; + req->cmd.cid = rdma_req->id; + payload_type = nvme_payload_type(&req->payload); + /* + * Check if icdoff is non zero, to avoid interop conflicts with + * targets with non-zero icdoff. Both SPDK and the Linux kernel + * targets use icdoff = 0. For targets with non-zero icdoff, we + * will currently just not use inline data for now. + */ + icd_supported = spdk_nvme_opc_get_data_transfer(req->cmd.opc) == SPDK_NVME_DATA_HOST_TO_CONTROLLER + && req->payload_size <= ctrlr->ioccsz_bytes && ctrlr->icdoff == 0; + + if (req->payload_size == 0) { + rc = nvme_rdma_build_null_request(rdma_req); + } else if (payload_type == NVME_PAYLOAD_TYPE_CONTIG) { + if (icd_supported) { + rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_contig_request(rqpair, rdma_req); + } + } else if (payload_type == NVME_PAYLOAD_TYPE_SGL) { + if (icd_supported) { + rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req); + } else { + rc = nvme_rdma_build_sgl_request(rqpair, rdma_req); + } + } else { + rc = -1; + } + + if (rc) { + rdma_req->req = NULL; + return rc; + } + + memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd)); + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests, + bool delay_cmd_submit) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + rqpair = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_qpair)); + if (!rqpair) { + SPDK_ERRLOG("failed to get create rqpair\n"); + return NULL; + } + + rqpair->num_entries = qsize; + rqpair->delay_cmd_submit = delay_cmd_submit; + qpair = &rqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + return NULL; + } + + rc = nvme_rdma_alloc_reqs(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n"); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n"); + + rc = nvme_rdma_alloc_rsps(rqpair); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc); + if (rc < 0) { + SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n"); + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free(rqpair); + return NULL; + } + SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n"); + + return qpair; +} + +static void +nvme_rdma_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_ctrlr *rctrlr = NULL; + struct nvme_rdma_cm_event_entry *entry, *tmp; + + nvme_rdma_unregister_mem(rqpair); + nvme_rdma_unregister_reqs(rqpair); + nvme_rdma_unregister_rsps(rqpair); + + if (rqpair->evt) { + rdma_ack_cm_event(rqpair->evt); + rqpair->evt = NULL; + } + + /* + * This works because we have the controller lock both in + * this function and in the function where we add new events. 
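+ * Any CM event already queued for this qpair is acked and returned to the
+ * free list below, before the QP and cm_id are destroyed.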
+ */ + if (qpair->ctrlr != NULL) { + rctrlr = nvme_rdma_ctrlr(qpair->ctrlr); + STAILQ_FOREACH_SAFE(entry, &rctrlr->pending_cm_events, link, tmp) { + if (nvme_rdma_qpair(entry->evt->id->context) == rqpair) { + STAILQ_REMOVE(&rctrlr->pending_cm_events, entry, nvme_rdma_cm_event_entry, link); + rdma_ack_cm_event(entry->evt); + STAILQ_INSERT_HEAD(&rctrlr->free_cm_events, entry, link); + } + } + } + + if (rqpair->cm_id) { + if (rqpair->rdma_qp) { + spdk_rdma_qp_disconnect(rqpair->rdma_qp); + if (rctrlr != NULL) { + if (nvme_rdma_process_event(rqpair, rctrlr->cm_channel, RDMA_CM_EVENT_DISCONNECTED)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Target did not respond to qpair disconnect.\n"); + } + } + spdk_rdma_qp_destroy(rqpair->rdma_qp); + rqpair->rdma_qp = NULL; + } + + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + } + + if (rqpair->cq) { + ibv_destroy_cq(rqpair->cq); + rqpair->cq = NULL; + } +} + +static void nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair; + + rqpair = nvme_rdma_qpair(qpair); + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + if (rqpair->defer_deletion_to_pg) { + nvme_qpair_set_state(qpair, NVME_QPAIR_DESTROYING); + return 0; + } + + nvme_rdma_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + + nvme_rdma_free_reqs(rqpair); + nvme_rdma_free_rsps(rqpair); + nvme_rdma_free(rqpair); + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests, + opts->delay_cmd_submit); +} + +static int +nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + /* do nothing here */ + return 0; +} + +static int nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); + +static struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + struct nvme_rdma_ctrlr *rctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + struct ibv_context **contexts; + struct ibv_device_attr dev_attr; + int i, flag, rc; + + rctrlr = nvme_rdma_calloc(1, sizeof(struct nvme_rdma_ctrlr)); + if (rctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + rctrlr->ctrlr.opts = *opts; + rctrlr->ctrlr.trid = *trid; + + if (opts->transport_retry_count > NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT) { + SPDK_NOTICELOG("transport_retry_count exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT); + rctrlr->ctrlr.opts.transport_retry_count = NVME_RDMA_CTRLR_MAX_TRANSPORT_RETRY_COUNT; + } + + if (opts->transport_ack_timeout > NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT) { + SPDK_NOTICELOG("transport_ack_timeout exceeds max value %d, use max value\n", + NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT); + rctrlr->ctrlr.opts.transport_ack_timeout = NVME_RDMA_CTRLR_MAX_TRANSPORT_ACK_TIMEOUT; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + nvme_rdma_free(rctrlr); + return NULL; + } + + i = 0; + rctrlr->max_sge = NVME_RDMA_MAX_SGL_DESCRIPTORS; + + while (contexts[i] != NULL) { + rc = ibv_query_device(contexts[i], &dev_attr); + if (rc < 0) { + 
SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + rdma_free_devices(contexts); + nvme_rdma_free(rctrlr); + return NULL; + } + rctrlr->max_sge = spdk_min(rctrlr->max_sge, (uint16_t)dev_attr.max_sge); + i++; + } + + rdma_free_devices(contexts); + + rc = nvme_ctrlr_construct(&rctrlr->ctrlr); + if (rc != 0) { + nvme_rdma_free(rctrlr); + return NULL; + } + + STAILQ_INIT(&rctrlr->pending_cm_events); + STAILQ_INIT(&rctrlr->free_cm_events); + rctrlr->cm_events = nvme_rdma_calloc(NVME_RDMA_NUM_CM_EVENTS, sizeof(*rctrlr->cm_events)); + if (rctrlr->cm_events == NULL) { + SPDK_ERRLOG("unable to allocat buffers to hold CM events.\n"); + goto destruct_ctrlr; + } + + for (i = 0; i < NVME_RDMA_NUM_CM_EVENTS; i++) { + STAILQ_INSERT_TAIL(&rctrlr->free_cm_events, &rctrlr->cm_events[i], link); + } + + rctrlr->cm_channel = rdma_create_event_channel(); + if (rctrlr->cm_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed\n"); + goto destruct_ctrlr; + } + + flag = fcntl(rctrlr->cm_channel->fd, F_GETFL); + if (fcntl(rctrlr->cm_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("Cannot set event channel to non blocking\n"); + goto destruct_ctrlr; + } + + rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0, + rctrlr->ctrlr.opts.admin_queue_size, 0, + rctrlr->ctrlr.opts.admin_queue_size, false); + if (!rctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + goto destruct_ctrlr; + } + + rc = nvme_transport_ctrlr_connect_qpair(&rctrlr->ctrlr, rctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + goto destruct_ctrlr; + } + + if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + goto destruct_ctrlr; + } + + nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n"); + return &rctrlr->ctrlr; + +destruct_ctrlr: + nvme_ctrlr_destruct(&rctrlr->ctrlr); + return NULL; +} + +static int +nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + struct nvme_rdma_cm_event_entry *entry; + + if (ctrlr->adminq) { + nvme_rdma_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + STAILQ_FOREACH(entry, &rctrlr->pending_cm_events, link) { + rdma_ack_cm_event(entry->evt); + } + + STAILQ_INIT(&rctrlr->free_cm_events); + STAILQ_INIT(&rctrlr->pending_cm_events); + nvme_rdma_free(rctrlr->cm_events); + + if (rctrlr->cm_channel) { + rdma_destroy_event_channel(rctrlr->cm_channel); + rctrlr->cm_channel = NULL; + } + + nvme_ctrlr_destruct_finish(ctrlr); + + nvme_rdma_free(rctrlr); + + return 0; +} + +static int +nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct ibv_send_wr *wr; + + rqpair = nvme_rdma_qpair(qpair); + assert(rqpair != NULL); + assert(req != NULL); + + rdma_req = nvme_rdma_req_get(rqpair); + if (!rdma_req) { + /* Inform the upper layer to try again later. 
*/ + return -EAGAIN; + } + + if (nvme_rdma_req_init(rqpair, req, rdma_req)) { + SPDK_ERRLOG("nvme_rdma_req_init() failed\n"); + TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link); + nvme_rdma_req_put(rqpair, rdma_req); + return -1; + } + + wr = &rdma_req->send_wr; + wr->next = NULL; + nvme_rdma_trace_ibv_sge(wr->sg_list); + return nvme_rdma_qpair_queue_send_wr(rqpair, wr); +} + +static int +nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + /* Currently, doing nothing here */ + return 0; +} + +static void +nvme_rdma_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + /* + * We cannot abort requests at the RDMA layer without + * unregistering them. If we do, we can still get error + * free completions on the shared completion queue. + */ + if (nvme_qpair_get_state(qpair) > NVME_QPAIR_DISCONNECTING && + nvme_qpair_get_state(qpair) != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static void +nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. */ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. 
+ */ + break; + } + } +} + +static inline int +nvme_rdma_request_ready(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req) +{ + nvme_rdma_req_complete(rdma_req, &rqpair->rsps[rdma_req->rsp_idx].cpl); + nvme_rdma_req_put(rqpair, rdma_req); + return nvme_rdma_post_recv(rqpair, rdma_req->rsp_idx); +} + +#define MAX_COMPLETIONS_PER_POLL 128 + +static void +nvme_rdma_fail_qpair(struct spdk_nvme_qpair *qpair, int failure_reason) +{ + if (failure_reason == IBV_WC_RETRY_EXC_ERR) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_REMOTE; + } else if (qpair->transport_failure_reason == SPDK_NVME_QPAIR_FAILURE_NONE) { + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + } + + nvme_ctrlr_disconnect_qpair(qpair); +} + +static void +nvme_rdma_conditional_fail_qpair(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_destroyed_qpair *qpair_tracker; + + assert(rqpair); + if (group) { + STAILQ_FOREACH(qpair_tracker, &group->destroyed_qpairs, link) { + if (qpair_tracker->destroyed_qpair_tracker == rqpair) { + return; + } + } + } + nvme_rdma_fail_qpair(&rqpair->qpair, 0); +} + +static int +nvme_rdma_cq_process_completions(struct ibv_cq *cq, uint32_t batch_size, + struct nvme_rdma_poll_group *group, + struct nvme_rdma_qpair *rdma_qpair) +{ + struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL]; + struct nvme_rdma_qpair *rqpair; + struct spdk_nvme_rdma_req *rdma_req; + struct spdk_nvme_rdma_rsp *rdma_rsp; + struct nvme_rdma_wr *rdma_wr; + uint32_t reaped = 0; + int completion_rc = 0; + int rc, i; + + rc = ibv_poll_cq(cq, batch_size, wc); + if (rc < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -ECANCELED; + } else if (rc == 0) { + return 0; + } + + for (i = 0; i < rc; i++) { + rdma_wr = (struct nvme_rdma_wr *)wc[i].wr_id; + switch (rdma_wr->type) { + case RDMA_WR_TYPE_RECV: + rdma_rsp = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_rsp, rdma_wr); + rqpair = rdma_rsp->rqpair; + assert(rqpair->current_num_recvs > 0); + rqpair->current_num_recvs--; + + if (wc[i].status) { + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n"); + + if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + rdma_req = &rqpair->rdma_reqs[rdma_rsp->cpl.cid]; + rdma_req->completion_flags |= NVME_RDMA_RECV_COMPLETED; + rdma_req->rsp_idx = rdma_rsp->idx; + + if ((rdma_req->completion_flags & NVME_RDMA_SEND_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + case RDMA_WR_TYPE_SEND: + rdma_req = SPDK_CONTAINEROF(rdma_wr, struct spdk_nvme_rdma_req, rdma_wr); + + /* If we are flushing I/O */ + if (wc[i].status) { + rqpair = rdma_req->req ? nvme_rdma_qpair(rdma_req->req->qpair) : NULL; + if (!rqpair) { + rqpair = rdma_qpair != NULL ? 
rdma_qpair : nvme_rdma_poll_group_get_qpair_by_id(group, + wc[i].qp_num); + } + assert(rqpair); + assert(rqpair->current_num_sends > 0); + rqpair->current_num_sends--; + nvme_rdma_conditional_fail_qpair(rqpair, group); + SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n", + rqpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + completion_rc = -ENXIO; + continue; + } + + rqpair = nvme_rdma_qpair(rdma_req->req->qpair); + rdma_req->completion_flags |= NVME_RDMA_SEND_COMPLETED; + rqpair->current_num_sends--; + + if ((rdma_req->completion_flags & NVME_RDMA_RECV_COMPLETED) != 0) { + if (spdk_unlikely(nvme_rdma_request_ready(rqpair, rdma_req))) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + nvme_rdma_conditional_fail_qpair(rqpair, group); + completion_rc = -ENXIO; + continue; + } + reaped++; + rqpair->num_completions++; + } + break; + + default: + SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", rdma_wr->type); + return -ECANCELED; + } + } + + if (completion_rc) { + return completion_rc; + } + + return reaped; +} + +static void +dummy_disconnected_qpair_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx) +{ + +} + +static int +nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair, + uint32_t max_completions) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + int rc = 0, batch_size; + struct ibv_cq *cq; + struct nvme_rdma_ctrlr *rctrlr; + + /* + * This is used during the connection phase. It's possible that we are still reaping error completions + * from other qpairs so we need to call the poll group function. Also, it's more correct since the cq + * is shared. + */ + if (qpair->poll_group != NULL) { + return spdk_nvme_poll_group_process_completions(qpair->poll_group->group, max_completions, + dummy_disconnected_qpair_cb); + } + + if (max_completions == 0) { + max_completions = rqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, rqpair->num_entries); + } + + if (nvme_qpair_is_admin_queue(&rqpair->qpair)) { + rctrlr = nvme_rdma_ctrlr(rqpair->qpair.ctrlr); + nvme_rdma_poll_events(rctrlr); + } + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + cq = rqpair->cq; + + rqpair->num_completions = 0; + do { + batch_size = spdk_min((max_completions - rqpair->num_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(cq, batch_size, NULL, rqpair); + + if (rc == 0) { + break; + /* Handle the case where we fail to poll the cq. */ + } else if (rc == -ECANCELED) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } else if (rc == -ENXIO) { + return rc; + } + } while (rqpair->num_completions < max_completions); + + if (spdk_unlikely(nvme_rdma_qpair_submit_sends(rqpair) || + nvme_rdma_qpair_submit_recvs(rqpair))) { + nvme_rdma_fail_qpair(qpair, 0); + return -ENXIO; + } + + if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + return rqpair->num_completions; +} + +static uint32_t +nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* max_mr_size by ibv_query_device indicates the largest value that we can + * set for a registered memory region. It is independent from the actual + * I/O size and is very likely to be larger than 2 MiB which is the + * granularity we currently register memory regions. 
Hence return + * UINT32_MAX here and let the generic layer use the controller data to + * moderate this value. + */ + return UINT32_MAX; +} + +static uint16_t +nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr); + + return rctrlr->max_sge; +} + +static int +nvme_rdma_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct spdk_nvme_rdma_req *rdma_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + rc = iter_fn(rdma_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_rdma_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_rdma_req *rdma_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) { + assert(rdma_req->req != NULL); + + if (rdma_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_rdma_req_complete(rdma_req, &cpl); + nvme_rdma_req_put(rqpair, rdma_req); + } +} + +static int +nvme_rdma_poller_create(struct nvme_rdma_poll_group *group, struct ibv_context *ctx) +{ + struct nvme_rdma_poller *poller; + + poller = calloc(1, sizeof(*poller)); + if (poller == NULL) { + SPDK_ERRLOG("Unable to allocate poller.\n"); + return -ENOMEM; + } + + poller->device = ctx; + poller->cq = ibv_create_cq(poller->device, DEFAULT_NVME_RDMA_CQ_SIZE, group, NULL, 0); + + if (poller->cq == NULL) { + free(poller); + return -EINVAL; + } + + STAILQ_INSERT_HEAD(&group->pollers, poller, link); + group->num_pollers++; + poller->current_num_wc = DEFAULT_NVME_RDMA_CQ_SIZE; + poller->required_num_wc = 0; + return 0; +} + +static void +nvme_rdma_poll_group_free_pollers(struct nvme_rdma_poll_group *group) +{ + struct nvme_rdma_poller *poller, *tmp_poller; + + STAILQ_FOREACH_SAFE(poller, &group->pollers, link, tmp_poller) { + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + STAILQ_REMOVE(&group->pollers, poller, nvme_rdma_poller, link); + free(poller); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_rdma_poll_group_create(void) +{ + struct nvme_rdma_poll_group *group; + struct ibv_context **contexts; + int i = 0; + + group = calloc(1, sizeof(*group)); + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + STAILQ_INIT(&group->pollers); + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + free(group); + return NULL; + } + + while (contexts[i] != NULL) { + if (nvme_rdma_poller_create(group, contexts[i])) { + nvme_rdma_poll_group_free_pollers(group); + free(group); + rdma_free_devices(contexts); + return NULL; + } + i++; + } + + rdma_free_devices(contexts); + STAILQ_INIT(&group->destroyed_qpairs); + return &group->group; +} + +struct nvme_rdma_qpair * +nvme_rdma_poll_group_get_qpair_by_id(struct nvme_rdma_poll_group *group, uint32_t qp_num) +{ + struct spdk_nvme_qpair *qpair; + struct nvme_rdma_destroyed_qpair *rqpair_tracker; + struct nvme_rdma_qpair *rqpair; + + STAILQ_FOREACH(qpair, &group->group.disconnected_qpairs, poll_group_stailq) { + rqpair = 
nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(qpair, &group->group.connected_qpairs, poll_group_stailq) { + rqpair = nvme_rdma_qpair(qpair); + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + STAILQ_FOREACH(rqpair_tracker, &group->destroyed_qpairs, link) { + rqpair = rqpair_tracker->destroyed_qpair_tracker; + if (rqpair->rdma_qp->qp->qp_num == qp_num) { + return rqpair; + } + } + + return NULL; +} + +static int +nvme_rdma_resize_cq(struct nvme_rdma_qpair *rqpair, struct nvme_rdma_poller *poller) +{ + int current_num_wc, required_num_wc; + + required_num_wc = poller->required_num_wc + WC_PER_QPAIR(rqpair->num_entries); + current_num_wc = poller->current_num_wc; + if (current_num_wc < required_num_wc) { + current_num_wc = spdk_max(current_num_wc * 2, required_num_wc); + } + + if (poller->current_num_wc != current_num_wc) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Resize RDMA CQ from %d to %d\n", poller->current_num_wc, + current_num_wc); + if (ibv_resize_cq(poller->cq, current_num_wc)) { + SPDK_ERRLOG("RDMA CQ resize failed: errno %d: %s\n", errno, spdk_strerror(errno)); + return -1; + } + + poller->current_num_wc = current_num_wc; + } + + poller->required_num_wc = required_num_wc; + return 0; +} + +static int +nvme_rdma_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(qpair->poll_group); + struct nvme_rdma_poller *poller; + + assert(rqpair->cq == NULL); + + STAILQ_FOREACH(poller, &group->pollers, link) { + if (poller->device == rqpair->cm_id->verbs) { + if (nvme_rdma_resize_cq(rqpair, poller)) { + return -EPROTO; + } + rqpair->cq = poller->cq; + break; + } + } + + if (rqpair->cq == NULL) { + SPDK_ERRLOG("Unable to find a cq for qpair %p on poll group %p\n", qpair, qpair->poll_group); + return -EINVAL; + } + + return 0; +} + +static int +nvme_rdma_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair); + struct nvme_rdma_poll_group *group; + struct nvme_rdma_destroyed_qpair *destroyed_qpair; + enum nvme_qpair_state state; + + if (rqpair->poll_group_disconnect_in_progress) { + return -EINPROGRESS; + } + + rqpair->poll_group_disconnect_in_progress = true; + state = nvme_qpair_get_state(qpair); + group = nvme_rdma_poll_group(qpair->poll_group); + rqpair->cq = NULL; + + /* + * We want to guard against an endless recursive loop while making + * sure the qpair is disconnected before we disconnect it from the qpair. + */ + if (state > NVME_QPAIR_DISCONNECTING && state != NVME_QPAIR_DESTROYING) { + nvme_ctrlr_disconnect_qpair(qpair); + } + + /* + * If this fails, the system is in serious trouble, + * just let the qpair get cleaned up immediately. 
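+ * ("this" is the tracker allocation below; without a tracker, deletion is not
+ * deferred to the poll group.)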
+ */ + destroyed_qpair = calloc(1, sizeof(*destroyed_qpair)); + if (destroyed_qpair == NULL) { + return 0; + } + + destroyed_qpair->destroyed_qpair_tracker = rqpair; + destroyed_qpair->completed_cycles = 0; + STAILQ_INSERT_TAIL(&group->destroyed_qpairs, destroyed_qpair, link); + + rqpair->defer_deletion_to_pg = true; + + rqpair->poll_group_disconnect_in_progress = false; + return 0; +} + +static int +nvme_rdma_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static int +nvme_rdma_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static void +nvme_rdma_poll_group_delete_qpair(struct nvme_rdma_poll_group *group, + struct nvme_rdma_destroyed_qpair *qpair_tracker) +{ + struct nvme_rdma_qpair *rqpair = qpair_tracker->destroyed_qpair_tracker; + + rqpair->defer_deletion_to_pg = false; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); +} + +static int64_t +nvme_rdma_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair, *tmp_qpair; + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + struct nvme_rdma_poll_group *group; + struct nvme_rdma_poller *poller; + int num_qpairs = 0, batch_size, rc; + int64_t total_completions = 0; + uint64_t completions_allowed = 0; + uint64_t completions_per_poller = 0; + uint64_t poller_completions = 0; + + + if (completions_per_qpair == 0) { + completions_per_qpair = MAX_COMPLETIONS_PER_POLL; + } + + group = nvme_rdma_poll_group(tgroup); + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + rqpair->num_completions = 0; + nvme_rdma_qpair_process_cm_event(rqpair); + + if (spdk_unlikely(qpair->transport_failure_reason != SPDK_NVME_QPAIR_FAILURE_NONE)) { + nvme_rdma_fail_qpair(qpair, 0); + disconnected_qpair_cb(qpair, tgroup->group->ctx); + continue; + } + num_qpairs++; + } + + completions_allowed = completions_per_qpair * num_qpairs; + completions_per_poller = spdk_max(completions_allowed / group->num_pollers, 1); + + STAILQ_FOREACH(poller, &group->pollers, link) { + poller_completions = 0; + do { + batch_size = spdk_min((completions_per_poller - poller_completions), MAX_COMPLETIONS_PER_POLL); + rc = nvme_rdma_cq_process_completions(poller->cq, batch_size, group, NULL); + if (rc <= 0) { + if (rc == -ECANCELED) { + return -EIO; + } + break; + } + + poller_completions += rc; + } while (poller_completions < completions_per_poller); + total_completions += poller_completions; + } + + STAILQ_FOREACH_SAFE(qpair, &tgroup->connected_qpairs, poll_group_stailq, tmp_qpair) { + rqpair = nvme_rdma_qpair(qpair); + if (spdk_unlikely(qpair->ctrlr->timeout_enabled)) { + nvme_rdma_qpair_check_timeout(qpair); + } + + nvme_rdma_qpair_submit_sends(rqpair); + nvme_rdma_qpair_submit_recvs(rqpair); + 
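/* Completions may have freed request slots; let the generic layer resubmit any queued requests. */ +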
nvme_qpair_resubmit_requests(&rqpair->qpair, rqpair->num_completions); + } + + /* + * Once a qpair is disconnected, we can still get flushed completions for those disconnected qpairs. + * For most pieces of hardware, those requests will complete immediately. However, there are certain + * cases where flushed requests will linger. Default is to destroy qpair after all completions are freed, + * but have a fallback for other cases where we don't get all of our completions back. + */ + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + qpair_tracker->completed_cycles++; + rqpair = qpair_tracker->destroyed_qpair_tracker; + if ((rqpair->current_num_sends == 0 && rqpair->current_num_recvs == 0) || + qpair_tracker->completed_cycles > NVME_RDMA_DESTROYED_QPAIR_EXPIRATION_CYCLES) { + nvme_rdma_poll_group_delete_qpair(group, qpair_tracker); + } + } + + return total_completions; +} + +static int +nvme_rdma_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + struct nvme_rdma_poll_group *group = nvme_rdma_poll_group(tgroup); + struct nvme_rdma_destroyed_qpair *qpair_tracker, *tmp_qpair_tracker; + struct nvme_rdma_qpair *rqpair; + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + STAILQ_FOREACH_SAFE(qpair_tracker, &group->destroyed_qpairs, link, tmp_qpair_tracker) { + rqpair = qpair_tracker->destroyed_qpair_tracker; + if (nvme_qpair_get_state(&rqpair->qpair) == NVME_QPAIR_DESTROYING) { + rqpair->defer_deletion_to_pg = false; + nvme_rdma_ctrlr_delete_io_qpair(rqpair->qpair.ctrlr, &rqpair->qpair); + } + + STAILQ_REMOVE(&group->destroyed_qpairs, qpair_tracker, nvme_rdma_destroyed_qpair, link); + free(qpair_tracker); + } + + nvme_rdma_poll_group_free_pollers(group); + free(group); + + return 0; +} + +void +spdk_nvme_rdma_init_hooks(struct spdk_nvme_rdma_hooks *hooks) +{ + g_nvme_hooks = *hooks; +} + +const struct spdk_nvme_transport_ops rdma_ops = { + .name = "RDMA", + .type = SPDK_NVME_TRANSPORT_RDMA, + .ctrlr_construct = nvme_rdma_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_rdma_ctrlr_destruct, + .ctrlr_enable = nvme_rdma_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_rdma_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_rdma_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_rdma_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_rdma_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_rdma_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_rdma_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_rdma_qpair_abort_reqs, + .qpair_reset = nvme_rdma_qpair_reset, + .qpair_submit_request = nvme_rdma_qpair_submit_request, + .qpair_process_completions = nvme_rdma_qpair_process_completions, + .qpair_iterate_requests = nvme_rdma_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_rdma_admin_qpair_abort_aers, + + .poll_group_create = nvme_rdma_poll_group_create, + .poll_group_connect_qpair = nvme_rdma_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_rdma_poll_group_disconnect_qpair, + .poll_group_add = nvme_rdma_poll_group_add, + .poll_group_remove = nvme_rdma_poll_group_remove, + .poll_group_process_completions = nvme_rdma_poll_group_process_completions, + .poll_group_destroy = nvme_rdma_poll_group_destroy, + 
+}; + +SPDK_NVME_TRANSPORT_REGISTER(rdma, &rdma_ops); diff --git a/src/spdk/lib/nvme/nvme_tcp.c b/src/spdk/lib/nvme/nvme_tcp.c new file mode 100644 index 000000000..98e8c6827 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_tcp.c @@ -0,0 +1,1973 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. All rights reserved. + * Copyright (c) 2020 Mellanox Technologies LTD. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * NVMe/TCP transport + */ + +#include "nvme_internal.h" + +#include "spdk/endian.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/stdinc.h" +#include "spdk/crc32.h" +#include "spdk/endian.h" +#include "spdk/assert.h" +#include "spdk/string.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/nvme_tcp.h" + +#define NVME_TCP_RW_BUFFER_SIZE 131072 +#define NVME_TCP_TIME_OUT_IN_SECONDS 2 + +#define NVME_TCP_HPDA_DEFAULT 0 +#define NVME_TCP_MAX_R2T_DEFAULT 1 +#define NVME_TCP_PDU_H2C_MIN_DATA_SIZE 4096 +#define NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE 8192 + +/* NVMe TCP transport extensions for spdk_nvme_ctrlr */ +struct nvme_tcp_ctrlr { + struct spdk_nvme_ctrlr ctrlr; +}; + +struct nvme_tcp_poll_group { + struct spdk_nvme_transport_poll_group group; + struct spdk_sock_group *sock_group; + uint32_t completions_per_qpair; + int64_t num_completions; +}; + +/* NVMe TCP qpair extensions for spdk_nvme_qpair */ +struct nvme_tcp_qpair { + struct spdk_nvme_qpair qpair; + struct spdk_sock *sock; + + TAILQ_HEAD(, nvme_tcp_req) free_reqs; + TAILQ_HEAD(, nvme_tcp_req) outstanding_reqs; + + TAILQ_HEAD(, nvme_tcp_pdu) send_queue; + struct nvme_tcp_pdu recv_pdu; + struct nvme_tcp_pdu send_pdu; /* only for error pdu and init pdu */ + struct nvme_tcp_pdu *send_pdus; /* Used by tcp_reqs */ + enum nvme_tcp_pdu_recv_state recv_state; + + struct nvme_tcp_req *tcp_reqs; + + uint16_t num_entries; + + bool host_hdgst_enable; + bool host_ddgst_enable; + + /** Specifies the maximum number of PDU-Data bytes per H2C Data Transfer PDU */ + uint32_t maxh2cdata; + + uint32_t maxr2t; + + /* 0 based value, which is used to guide the padding */ + uint8_t cpda; + + enum nvme_tcp_qpair_state state; +}; + +enum nvme_tcp_req_state { + NVME_TCP_REQ_FREE, + NVME_TCP_REQ_ACTIVE, + NVME_TCP_REQ_ACTIVE_R2T, +}; + +struct nvme_tcp_req { + struct nvme_request *req; + enum nvme_tcp_req_state state; + uint16_t cid; + uint16_t ttag; + uint32_t datao; + uint32_t r2tl_remain; + uint32_t active_r2ts; + bool in_capsule_data; + /* It is used to track whether the req can be safely freed */ + struct { + uint8_t send_ack : 1; + uint8_t data_recv : 1; + uint8_t r2t_recv : 1; + uint8_t reserved : 5; + } ordering; + struct nvme_tcp_pdu *send_pdu; + struct iovec iov[NVME_TCP_MAX_SGL_DESCRIPTORS]; + uint32_t iovcnt; + struct nvme_tcp_qpair *tqpair; + TAILQ_ENTRY(nvme_tcp_req) link; +}; + +static void nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req); + +static inline struct nvme_tcp_qpair * +nvme_tcp_qpair(struct spdk_nvme_qpair *qpair) +{ + assert(qpair->trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(qpair, struct nvme_tcp_qpair, qpair); +} + +static inline struct nvme_tcp_poll_group * +nvme_tcp_poll_group(struct spdk_nvme_transport_poll_group *group) +{ + return SPDK_CONTAINEROF(group, struct nvme_tcp_poll_group, group); +} + +static inline struct nvme_tcp_ctrlr * +nvme_tcp_ctrlr(struct spdk_nvme_ctrlr *ctrlr) +{ + assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_TCP); + return SPDK_CONTAINEROF(ctrlr, struct nvme_tcp_ctrlr, ctrlr); +} + +static struct nvme_tcp_req * +nvme_tcp_req_get(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_req *tcp_req; + + tcp_req = TAILQ_FIRST(&tqpair->free_reqs); + if (!tcp_req) { + return NULL; + } + + assert(tcp_req->state == NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_ACTIVE; + TAILQ_REMOVE(&tqpair->free_reqs, tcp_req, link); + tcp_req->datao = 0; + tcp_req->req = NULL; + tcp_req->in_capsule_data = false; + 
tcp_req->r2tl_remain = 0; + tcp_req->active_r2ts = 0; + tcp_req->iovcnt = 0; + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.data_recv = 0; + tcp_req->ordering.r2t_recv = 0; + memset(tcp_req->send_pdu, 0, sizeof(struct nvme_tcp_pdu)); + TAILQ_INSERT_TAIL(&tqpair->outstanding_reqs, tcp_req, link); + + return tcp_req; +} + +static void +nvme_tcp_req_put(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + assert(tcp_req->state != NVME_TCP_REQ_FREE); + tcp_req->state = NVME_TCP_REQ_FREE; + TAILQ_INSERT_HEAD(&tqpair->free_reqs, tcp_req, link); +} + +static int +nvme_tcp_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service) +{ + struct addrinfo *res; + struct addrinfo hints; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + ret = getaddrinfo(addr, service, &hints, &res); + if (ret) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret); + return ret; + } + + if (res->ai_addrlen > sizeof(*sa)) { + SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen); + ret = EINVAL; + } else { + memcpy(sa, res->ai_addr, res->ai_addrlen); + } + + freeaddrinfo(res); + return ret; +} + +static void +nvme_tcp_free_reqs(struct nvme_tcp_qpair *tqpair) +{ + free(tqpair->tcp_reqs); + tqpair->tcp_reqs = NULL; + + spdk_free(tqpair->send_pdus); + tqpair->send_pdus = NULL; +} + +static int +nvme_tcp_alloc_reqs(struct nvme_tcp_qpair *tqpair) +{ + uint16_t i; + struct nvme_tcp_req *tcp_req; + + tqpair->tcp_reqs = calloc(tqpair->num_entries, sizeof(struct nvme_tcp_req)); + if (tqpair->tcp_reqs == NULL) { + SPDK_ERRLOG("Failed to allocate tcp_reqs on tqpair=%p\n", tqpair); + goto fail; + } + + tqpair->send_pdus = spdk_zmalloc(tqpair->num_entries * sizeof(struct nvme_tcp_pdu), + 0x1000, NULL, + SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA); + + if (tqpair->send_pdus == NULL) { + SPDK_ERRLOG("Failed to allocate send_pdus on tqpair=%p\n", tqpair); + goto fail; + } + + TAILQ_INIT(&tqpair->send_queue); + TAILQ_INIT(&tqpair->free_reqs); + TAILQ_INIT(&tqpair->outstanding_reqs); + for (i = 0; i < tqpair->num_entries; i++) { + tcp_req = &tqpair->tcp_reqs[i]; + tcp_req->cid = i; + tcp_req->tqpair = tqpair; + tcp_req->send_pdu = &tqpair->send_pdus[i]; + TAILQ_INSERT_TAIL(&tqpair->free_reqs, tcp_req, link); + } + + return 0; +fail: + nvme_tcp_free_reqs(tqpair); + return -ENOMEM; +} + +static void +nvme_tcp_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_pdu *pdu; + + spdk_sock_close(&tqpair->sock); + + /* clear the send_queue */ + while (!TAILQ_EMPTY(&tqpair->send_queue)) { + pdu = TAILQ_FIRST(&tqpair->send_queue); + /* Remove the pdu from the send_queue to prevent the wrong sending out + * in the next round connection + */ + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + } +} + +static void nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr); + +static int +nvme_tcp_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair; + + if (!qpair) { + return -1; + } + + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_tcp_qpair_abort_reqs(qpair, 1); + nvme_qpair_deinit(qpair); + tqpair = nvme_tcp_qpair(qpair); + nvme_tcp_free_reqs(tqpair); + free(tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + return 0; +} + 
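/*
 * Illustrative usage sketch (an assumption added for context, using the public
 * probe/connect API from spdk/nvme.h; the address, port and subsystem NQN are
 * hypothetical):
 *
 *   struct spdk_nvme_transport_id trid = {};
 *   struct spdk_nvme_ctrlr *ctrlr;
 *
 *   spdk_nvme_transport_id_parse(&trid, "trtype:TCP adrfam:IPv4 "
 *       "traddr:192.0.2.10 trsvcid:4420 subnqn:nqn.2016-06.io.spdk:cnode1");
 *   ctrlr = spdk_nvme_connect(&trid, NULL, 0);
 *   ...
 *   spdk_nvme_detach(ctrlr);
 *
 * A connect over TCP reaches nvme_tcp_ctrlr_construct() further down in this
 * file, which creates the admin qpair, performs the ICReq/ICResp exchange and
 * then issues the fabrics CONNECT command.
 */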
+static int +nvme_tcp_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + struct nvme_tcp_ctrlr *tctrlr = nvme_tcp_ctrlr(ctrlr); + + if (ctrlr->adminq) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, ctrlr->adminq); + } + + nvme_ctrlr_destruct_finish(ctrlr); + + free(tctrlr); + + return 0; +} + +static void +_pdu_write_done(void *cb_arg, int err) +{ + struct nvme_tcp_pdu *pdu = cb_arg; + struct nvme_tcp_qpair *tqpair = pdu->qpair; + + TAILQ_REMOVE(&tqpair->send_queue, pdu, tailq); + + if (err != 0) { + nvme_transport_ctrlr_disconnect_qpair(tqpair->qpair.ctrlr, &tqpair->qpair); + return; + } + + assert(pdu->cb_fn != NULL); + pdu->cb_fn(pdu->cb_arg); +} + +static int +nvme_tcp_qpair_write_pdu(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, + nvme_tcp_qpair_xfer_complete_cb cb_fn, + void *cb_arg) +{ + int hlen; + uint32_t crc32c; + uint32_t mapped_length = 0; + + hlen = pdu->hdr.common.hlen; + + /* Header Digest */ + if (g_nvme_tcp_hdgst[pdu->hdr.common.pdu_type] && tqpair->host_hdgst_enable) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + MAKE_DIGEST_WORD((uint8_t *)pdu->hdr.raw + hlen, crc32c); + } + + /* Data Digest */ + if (pdu->data_len > 0 && g_nvme_tcp_ddgst[pdu->hdr.common.pdu_type] && tqpair->host_ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + MAKE_DIGEST_WORD(pdu->data_digest, crc32c); + } + + pdu->cb_fn = cb_fn; + pdu->cb_arg = cb_arg; + + pdu->sock_req.iovcnt = nvme_tcp_build_iovs(pdu->iov, NVME_TCP_MAX_SGL_DESCRIPTORS, pdu, + tqpair->host_hdgst_enable, tqpair->host_ddgst_enable, + &mapped_length); + pdu->qpair = tqpair; + pdu->sock_req.cb_fn = _pdu_write_done; + pdu->sock_req.cb_arg = pdu; + TAILQ_INSERT_TAIL(&tqpair->send_queue, pdu, tailq); + spdk_sock_writev_async(tqpair->sock, &pdu->sock_req); + + return 0; +} + +/* + * Build SGL describing contiguous payload buffer. + */ +static int +nvme_tcp_build_contig_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + struct nvme_request *req = tcp_req->req; + + tcp_req->iov[0].iov_base = req->payload.contig_or_cb_arg + req->payload_offset; + tcp_req->iov[0].iov_len = req->payload_size; + tcp_req->iovcnt = 1; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG); + + return 0; +} + +/* + * Build SGL describing scattered payload buffer. + */ +static int +nvme_tcp_build_sgl_request(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_req *tcp_req) +{ + int rc; + uint32_t length, remaining_size, iovcnt = 0, max_num_sgl; + struct nvme_request *req = tcp_req->req; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + assert(req->payload_size != 0); + assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL); + assert(req->payload.reset_sgl_fn != NULL); + assert(req->payload.next_sge_fn != NULL); + req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset); + + max_num_sgl = spdk_min(req->qpair->ctrlr->max_sges, NVME_TCP_MAX_SGL_DESCRIPTORS); + remaining_size = req->payload_size; + + do { + rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &tcp_req->iov[iovcnt].iov_base, + &length); + if (rc) { + return -1; + } + + length = spdk_min(length, remaining_size); + tcp_req->iov[iovcnt].iov_len = length; + remaining_size -= length; + iovcnt++; + } while (remaining_size > 0 && iovcnt < max_num_sgl); + + + /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. 
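Hitting this path means the payload could not be fully described within max_num_sgl iovec entries, so the request is rejected here rather than submitted with a truncated SGL.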
*/ + if (remaining_size > 0) { + SPDK_ERRLOG("Failed to construct tcp_req=%p, and the iovcnt=%u, remaining_size=%u\n", + tcp_req, iovcnt, remaining_size); + return -1; + } + + tcp_req->iovcnt = iovcnt; + + return 0; +} + +static int +nvme_tcp_req_init(struct nvme_tcp_qpair *tqpair, struct nvme_request *req, + struct nvme_tcp_req *tcp_req) +{ + struct spdk_nvme_ctrlr *ctrlr = tqpair->qpair.ctrlr; + int rc = 0; + enum spdk_nvme_data_transfer xfer; + uint32_t max_incapsule_data_size; + + tcp_req->req = req; + req->cmd.cid = tcp_req->cid; + req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG; + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_TRANSPORT; + req->cmd.dptr.sgl1.unkeyed.length = req->payload_size; + + if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) { + rc = nvme_tcp_build_contig_request(tqpair, tcp_req); + } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) { + rc = nvme_tcp_build_sgl_request(tqpair, tcp_req); + } else { + rc = -1; + } + + if (rc) { + return rc; + } + + if (req->cmd.opc == SPDK_NVME_OPC_FABRIC) { + struct spdk_nvmf_capsule_cmd *nvmf_cmd = (struct spdk_nvmf_capsule_cmd *)&req->cmd; + + xfer = spdk_nvme_opc_get_data_transfer(nvmf_cmd->fctype); + } else { + xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc); + } + if (xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + max_incapsule_data_size = ctrlr->ioccsz_bytes; + if ((req->cmd.opc == SPDK_NVME_OPC_FABRIC) || nvme_qpair_is_admin_queue(&tqpair->qpair)) { + max_incapsule_data_size = spdk_min(max_incapsule_data_size, NVME_TCP_IN_CAPSULE_DATA_MAX_SIZE); + } + + if (req->payload_size <= max_incapsule_data_size) { + req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK; + req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET; + req->cmd.dptr.sgl1.address = 0; + tcp_req->in_capsule_data = true; + } + } + + return 0; +} + +static inline void +nvme_tcp_req_put_safe(struct nvme_tcp_req *tcp_req) +{ + if (tcp_req->ordering.send_ack && tcp_req->ordering.data_recv) { + assert(tcp_req->state == NVME_TCP_REQ_ACTIVE); + assert(tcp_req->tqpair != NULL); + nvme_tcp_req_put(tcp_req->tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_cmd_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + tcp_req->ordering.send_ack = 1; + /* Handle the r2t case */ + if (spdk_unlikely(tcp_req->ordering.r2t_recv)) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + nvme_tcp_req_put_safe(tcp_req); + } +} + +static int +nvme_tcp_qpair_capsule_cmd_send(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_pdu *pdu; + struct spdk_nvme_tcp_cmd *capsule_cmd; + uint32_t plen = 0, alignment; + uint8_t pdo; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + pdu = tcp_req->send_pdu; + + capsule_cmd = &pdu->hdr.capsule_cmd; + capsule_cmd->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_CAPSULE_CMD; + plen = capsule_cmd->common.hlen = sizeof(*capsule_cmd); + capsule_cmd->ccsqe = tcp_req->req->cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "capsule_cmd cid=%u on tqpair(%p)\n", tcp_req->req->cmd.cid, tqpair); + + if (tqpair->host_hdgst_enable) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Header digest is enabled for capsule command on tcp_req=%p\n", + tcp_req); + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + if ((tcp_req->req->payload_size == 0) || !tcp_req->in_capsule_data) { + goto end; + } + + pdo = plen; + pdu->padding_len = 0; + if 
(tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + pdu->padding_len = alignment - plen; + pdo = alignment; + plen = alignment; + } + } + + capsule_cmd->common.pdo = pdo; + plen += tcp_req->req->payload_size; + if (tqpair->host_ddgst_enable) { + capsule_cmd->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + tcp_req->datao = 0; + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + 0, tcp_req->req->payload_size); +end: + capsule_cmd->common.plen = plen; + return nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_qpair_cmd_send_complete, tcp_req); + +} + +static int +nvme_tcp_qpair_submit_request(struct spdk_nvme_qpair *qpair, + struct nvme_request *req) +{ + struct nvme_tcp_qpair *tqpair; + struct nvme_tcp_req *tcp_req; + + tqpair = nvme_tcp_qpair(qpair); + assert(tqpair != NULL); + assert(req != NULL); + + tcp_req = nvme_tcp_req_get(tqpair); + if (!tcp_req) { + /* Inform the upper layer to try again later. */ + return -EAGAIN; + } + + if (nvme_tcp_req_init(tqpair, req, tcp_req)) { + SPDK_ERRLOG("nvme_tcp_req_init() failed\n"); + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_tcp_req_put(tqpair, tcp_req); + return -1; + } + + return nvme_tcp_qpair_capsule_cmd_send(tqpair, tcp_req); +} + +static int +nvme_tcp_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + return 0; +} + +static void +nvme_tcp_req_complete(struct nvme_tcp_req *tcp_req, + struct spdk_nvme_cpl *rsp) +{ + struct nvme_request *req; + + assert(tcp_req->req != NULL); + req = tcp_req->req; + + TAILQ_REMOVE(&tcp_req->tqpair->outstanding_reqs, tcp_req, link); + nvme_complete_request(req->cb_fn, req->cb_arg, req->qpair, req, rsp); + nvme_free_request(req); +} + +static void +nvme_tcp_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + cpl.status.dnr = dnr; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static void +nvme_tcp_qpair_set_recv_state(struct nvme_tcp_qpair *tqpair, + enum nvme_tcp_pdu_recv_state state) +{ + if (tqpair->recv_state == state) { + SPDK_ERRLOG("The recv state of tqpair=%p is same with the state(%d) to be set\n", + tqpair, state); + return; + } + + tqpair->recv_state = state; + switch (state) { + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + case NVME_TCP_PDU_RECV_STATE_ERROR: + memset(&tqpair->recv_pdu, 0, sizeof(struct nvme_tcp_pdu)); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + default: + break; + } +} + +static void +nvme_tcp_qpair_send_h2c_term_req_complete(void *cb_arg) +{ + struct nvme_tcp_qpair *tqpair = cb_arg; + + tqpair->state = NVME_TCP_QPAIR_STATE_EXITING; +} + +static void +nvme_tcp_qpair_send_h2c_term_req(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + enum spdk_nvme_tcp_term_req_fes fes, uint32_t error_offset) +{ + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_term_req_hdr *h2c_term_req; + uint32_t h2c_term_req_hdr_len = sizeof(*h2c_term_req); + uint8_t copy_len; + + rsp_pdu = &tqpair->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_term_req = &rsp_pdu->hdr.term_req; + h2c_term_req->common.pdu_type = 
SPDK_NVME_TCP_PDU_TYPE_H2C_TERM_REQ; + h2c_term_req->common.hlen = h2c_term_req_hdr_len; + + if ((fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + DSET32(&h2c_term_req->fei, error_offset); + } + + copy_len = pdu->hdr.common.hlen; + if (copy_len > SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE) { + copy_len = SPDK_NVME_TCP_TERM_REQ_ERROR_DATA_MAX_SIZE; + } + + /* Copy the error info into the buffer */ + memcpy((uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, pdu->hdr.raw, copy_len); + nvme_tcp_pdu_set_data(rsp_pdu, (uint8_t *)rsp_pdu->hdr.raw + h2c_term_req_hdr_len, copy_len); + + /* Contain the header len of the wrong received pdu */ + h2c_term_req->common.plen = h2c_term_req->common.hlen + copy_len; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_send_h2c_term_req_complete, NULL); + +} + +static void +nvme_tcp_pdu_ch_handle(struct nvme_tcp_qpair *tqpair) +{ + struct nvme_tcp_pdu *pdu; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + uint32_t expected_hlen, hd_len = 0; + bool plen_error = false; + + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "pdu type = %d\n", pdu->hdr.common.pdu_type); + if (pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_IC_RESP) { + if (tqpair->state != NVME_TCP_QPAIR_STATE_INVALID) { + SPDK_ERRLOG("Already received IC_RESP PDU, and we should reject this pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + expected_hlen = sizeof(struct spdk_nvme_tcp_ic_resp); + if (pdu->hdr.common.plen != expected_hlen) { + plen_error = true; + } + } else { + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("The TCP/IP tqpair connection is not negotitated\n"); + fes = SPDK_NVME_TCP_TERM_REQ_FES_PDU_SEQUENCE_ERROR; + goto err; + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + expected_hlen = sizeof(struct spdk_nvme_tcp_rsp); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + expected_hlen = sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + if (pdu->hdr.common.plen < pdu->hdr.common.pdo) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + expected_hlen = sizeof(struct spdk_nvme_tcp_term_req_hdr); + if ((pdu->hdr.common.plen <= expected_hlen) || + (pdu->hdr.common.plen > SPDK_NVME_TCP_TERM_REQ_PDU_MAX_SIZE)) { + plen_error = true; + } + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + expected_hlen = sizeof(struct spdk_nvme_tcp_r2t_hdr); + if (pdu->hdr.common.flags & SPDK_NVME_TCP_CH_FLAGS_HDGSTF) { + hd_len = SPDK_NVME_TCP_DIGEST_LEN; + } + + if (pdu->hdr.common.plen != (expected_hlen + hd_len)) { + plen_error = true; + } + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, pdu_type); + goto err; + } + } + + if (pdu->hdr.common.hlen != expected_hlen) { + SPDK_ERRLOG("Expected PDU header length %u, got %u\n", + expected_hlen, pdu->hdr.common.hlen); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, hlen); + goto err; + + } else if (plen_error) { + 
fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_common_pdu_hdr, plen); + goto err; + } else { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + nvme_tcp_pdu_calc_psh_len(&tqpair->recv_pdu, tqpair->host_hdgst_enable); + return; + } +err: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); +} + +static struct nvme_tcp_req * +get_nvme_active_req_by_cid(struct nvme_tcp_qpair *tqpair, uint32_t cid) +{ + assert(tqpair != NULL); + if ((cid >= tqpair->num_entries) || (tqpair->tcp_reqs[cid].state == NVME_TCP_REQ_FREE)) { + return NULL; + } + + return &tqpair->tcp_reqs[cid]; +} + +static void +nvme_tcp_c2h_data_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu, uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data; + struct spdk_nvme_cpl cpl = {}; + uint8_t flags; + + tcp_req = pdu->req; + assert(tcp_req != NULL); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + c2h_data = &pdu->hdr.c2h_data; + tcp_req->datao += pdu->data_len; + flags = c2h_data->common.flags; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + if (flags & SPDK_NVME_TCP_C2H_DATA_FLAGS_SUCCESS) { + if (tcp_req->datao == tcp_req->req->payload_size) { + cpl.status.p = 0; + } else { + cpl.status.p = 1; + } + + cpl.cid = tcp_req->cid; + cpl.sqid = tqpair->qpair.id; + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + } +} + +static const char *spdk_nvme_tcp_term_req_fes_str[] = { + "Invalid PDU Header Field", + "PDU Sequence Error", + "Header Digest Error", + "Data Transfer Out of Range", + "Data Transfer Limit Exceeded", + "Unsupported parameter", +}; + +static void +nvme_tcp_c2h_term_req_dump(struct spdk_nvme_tcp_term_req_hdr *c2h_term_req) +{ + SPDK_ERRLOG("Error info of pdu(%p): %s\n", c2h_term_req, + spdk_nvme_tcp_term_req_fes_str[c2h_term_req->fes]); + if ((c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD) || + (c2h_term_req->fes == SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER)) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "The offset from the start of the PDU header is %u\n", + DGET32(c2h_term_req->fei)); + } + /* we may also need to dump some other info here */ +} + +static void +nvme_tcp_c2h_term_req_payload_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + nvme_tcp_c2h_term_req_dump(&pdu->hdr.term_req); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); +} + +static void +nvme_tcp_pdu_payload_handle(struct nvme_tcp_qpair *tqpair, + uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu *pdu; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + + /* check data digest if need */ + if (pdu->ddgst_enable) { + crc32c = nvme_tcp_pdu_calc_data_digest(pdu); + rc = MATCH_DIGEST_WORD(pdu->data_digest, crc32c); + if (rc == 0) { + SPDK_ERRLOG("data digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_payload_handle(tqpair, pdu, reaped); + break; + + case 
SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_payload_handle(tqpair, pdu); + break; + + default: + /* The code should not go to here */ + SPDK_ERRLOG("The code should not go to here\n"); + break; + } +} + +static void +nvme_tcp_send_icreq_complete(void *cb_arg) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Complete the icreq send for tqpair=%p\n", + (struct nvme_tcp_qpair *)cb_arg); +} + +static void +nvme_tcp_icresp_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_ic_resp *ic_resp = &pdu->hdr.ic_resp; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + int recv_buf_size; + + /* Only PFV 0 is defined currently */ + if (ic_resp->pfv != 0) { + SPDK_ERRLOG("Expected ICResp PFV %u, got %u\n", 0u, ic_resp->pfv); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, pfv); + goto end; + } + + if (ic_resp->maxh2cdata < NVME_TCP_PDU_H2C_MIN_DATA_SIZE) { + SPDK_ERRLOG("Expected ICResp maxh2cdata >=%u, got %u\n", NVME_TCP_PDU_H2C_MIN_DATA_SIZE, + ic_resp->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, maxh2cdata); + goto end; + } + tqpair->maxh2cdata = ic_resp->maxh2cdata; + + if (ic_resp->cpda > SPDK_NVME_TCP_CPDA_MAX) { + SPDK_ERRLOG("Expected ICResp cpda <=%u, got %u\n", SPDK_NVME_TCP_CPDA_MAX, ic_resp->cpda); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_ic_resp, cpda); + goto end; + } + tqpair->cpda = ic_resp->cpda; + + tqpair->host_hdgst_enable = ic_resp->dgst.bits.hdgst_enable ? true : false; + tqpair->host_ddgst_enable = ic_resp->dgst.bits.ddgst_enable ? true : false; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_hdgst_enable: %u\n", tqpair->host_hdgst_enable); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "host_ddgst_enable: %u\n", tqpair->host_ddgst_enable); + + /* Now that we know whether digests are enabled, properly size the receive buffer to + * handle several incoming 4K read commands according to SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR + * parameter. */ + recv_buf_size = 0x1000 + sizeof(struct spdk_nvme_tcp_c2h_data_hdr); + + if (tqpair->host_hdgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (tqpair->host_ddgst_enable) { + recv_buf_size += SPDK_NVME_TCP_DIGEST_LEN; + } + + if (spdk_sock_set_recvbuf(tqpair->sock, recv_buf_size * SPDK_NVMF_TCP_RECV_BUF_SIZE_FACTOR) < 0) { + SPDK_WARNLOG("Unable to allocate enough memory for receive buffer on tqpair=%p with size=%d\n", + tqpair, + recv_buf_size); + /* Not fatal. 
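The qpair simply keeps the socket's current receive buffer size and still transitions to NVME_TCP_QPAIR_STATE_RUNNING below.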
*/ + } + + tqpair->state = NVME_TCP_QPAIR_STATE_RUNNING; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_capsule_resp_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu, + uint32_t *reaped) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_rsp *capsule_resp = &pdu->hdr.capsule_resp; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + struct spdk_nvme_cpl cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cpl = capsule_resp->rccqe; + cid = cpl.cid; + + /* Recv the pdu again */ + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req is found with cid=%u for tqpair=%p\n", cid, tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_rsp, rccqe); + goto end; + + } + + nvme_tcp_req_complete(tcp_req, &cpl); + if (tcp_req->ordering.send_ack) { + (*reaped)++; + } + + tcp_req->ordering.data_recv = 1; + nvme_tcp_req_put_safe(tcp_req); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "complete tcp_req(%p) on tqpair=%p\n", tcp_req, tqpair); + + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_term_req_hdr_handle(struct nvme_tcp_qpair *tqpair, + struct nvme_tcp_pdu *pdu) +{ + struct spdk_nvme_tcp_term_req_hdr *c2h_term_req = &pdu->hdr.term_req; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + if (c2h_term_req->fes > SPDK_NVME_TCP_TERM_REQ_FES_INVALID_DATA_UNSUPPORTED_PARAMETER) { + SPDK_ERRLOG("Fatal Error Stauts(FES) is unknown for c2h_term_req pdu=%p\n", pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_term_req_hdr, fes); + goto end; + } + + /* set the data buffer */ + nvme_tcp_pdu_set_data(pdu, (uint8_t *)pdu->hdr.raw + c2h_term_req->common.hlen, + c2h_term_req->common.plen - c2h_term_req->common.hlen); + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_c2h_data_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_c2h_data_hdr *c2h_data = &pdu->hdr.c2h_data; + uint32_t error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVME, "c2h_data info on tqpair(%p): datao=%u, datal=%u, cccid=%d\n", + tqpair, c2h_data->datao, c2h_data->datal, c2h_data->cccid); + tcp_req = get_nvme_active_req_by_cid(tqpair, c2h_data->cccid); + if (!tcp_req) { + SPDK_ERRLOG("no tcp_req found for c2hdata cid=%d\n", c2h_data->cccid); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, cccid); + goto end; + + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "tcp_req(%p) on tqpair(%p): datao=%u, payload_size=%u\n", + tcp_req, tqpair, tcp_req->datao, tcp_req->req->payload_size); + + if (c2h_data->datal > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid datal for tcp_req(%p), datal(%u) exceeds payload_size(%u)\n", + tcp_req, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + goto end; + } + + if 
(tcp_req->datao != c2h_data->datao) { + SPDK_ERRLOG("Invalid datao for tcp_req(%p), received datal(%u) != datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, tcp_req->datao); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datao); + goto end; + } + + if ((c2h_data->datao + c2h_data->datal) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid data range for tcp_req(%p), received (datao(%u) + datal(%u)) > datao(%u) in tcp_req\n", + tcp_req, c2h_data->datao, c2h_data->datal, tcp_req->req->payload_size); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_c2h_data_hdr, datal); + goto end; + + } + + nvme_tcp_pdu_set_data_buf(pdu, tcp_req->iov, tcp_req->iovcnt, + c2h_data->datao, c2h_data->datal); + pdu->req = tcp_req; + + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD); + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; +} + +static void +nvme_tcp_qpair_h2c_data_send_complete(void *cb_arg) +{ + struct nvme_tcp_req *tcp_req = cb_arg; + + assert(tcp_req != NULL); + + tcp_req->ordering.send_ack = 1; + if (tcp_req->r2tl_remain) { + nvme_tcp_send_h2c_data(tcp_req); + } else { + assert(tcp_req->active_r2ts > 0); + tcp_req->active_r2ts--; + tcp_req->state = NVME_TCP_REQ_ACTIVE; + /* Need also call this function to free the resource */ + nvme_tcp_req_put_safe(tcp_req); + } +} + +static void +nvme_tcp_send_h2c_data(struct nvme_tcp_req *tcp_req) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(tcp_req->req->qpair); + struct nvme_tcp_pdu *rsp_pdu; + struct spdk_nvme_tcp_h2c_data_hdr *h2c_data; + uint32_t plen, pdo, alignment; + + /* Reinit the send_ack and r2t_recv bits */ + tcp_req->ordering.send_ack = 0; + tcp_req->ordering.r2t_recv = 0; + rsp_pdu = tcp_req->send_pdu; + memset(rsp_pdu, 0, sizeof(*rsp_pdu)); + h2c_data = &rsp_pdu->hdr.h2c_data; + + h2c_data->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_H2C_DATA; + plen = h2c_data->common.hlen = sizeof(*h2c_data); + h2c_data->cccid = tcp_req->cid; + h2c_data->ttag = tcp_req->ttag; + h2c_data->datao = tcp_req->datao; + + h2c_data->datal = spdk_min(tcp_req->r2tl_remain, tqpair->maxh2cdata); + nvme_tcp_pdu_set_data_buf(rsp_pdu, tcp_req->iov, tcp_req->iovcnt, + h2c_data->datao, h2c_data->datal); + tcp_req->r2tl_remain -= h2c_data->datal; + + if (tqpair->host_hdgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_HDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + rsp_pdu->padding_len = 0; + pdo = plen; + if (tqpair->cpda) { + alignment = (tqpair->cpda + 1) << 2; + if (alignment > plen) { + rsp_pdu->padding_len = alignment - plen; + pdo = plen = alignment; + } + } + + h2c_data->common.pdo = pdo; + plen += h2c_data->datal; + if (tqpair->host_ddgst_enable) { + h2c_data->common.flags |= SPDK_NVME_TCP_CH_FLAGS_DDGSTF; + plen += SPDK_NVME_TCP_DIGEST_LEN; + } + + h2c_data->common.plen = plen; + tcp_req->datao += h2c_data->datal; + if (!tcp_req->r2tl_remain) { + h2c_data->common.flags |= SPDK_NVME_TCP_H2C_DATA_FLAGS_LAST_PDU; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "h2c_data info: datao=%u, datal=%u, pdu_len=%u for tqpair=%p\n", + h2c_data->datao, h2c_data->datal, h2c_data->common.plen, tqpair); + + nvme_tcp_qpair_write_pdu(tqpair, rsp_pdu, nvme_tcp_qpair_h2c_data_send_complete, tcp_req); +} + +static void +nvme_tcp_r2t_hdr_handle(struct nvme_tcp_qpair *tqpair, struct nvme_tcp_pdu *pdu) +{ + struct nvme_tcp_req *tcp_req; + struct spdk_nvme_tcp_r2t_hdr 
*r2t = &pdu->hdr.r2t; + uint32_t cid, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter\n"); + cid = r2t->cccid; + tcp_req = get_nvme_active_req_by_cid(tqpair, cid); + if (!tcp_req) { + SPDK_ERRLOG("Cannot find tcp_req for tqpair=%p\n", tqpair); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, cccid); + goto end; + } + + tcp_req->ordering.r2t_recv = 1; + SPDK_DEBUGLOG(SPDK_LOG_NVME, "r2t info: r2to=%u, r2tl=%u for tqpair=%p\n", r2t->r2to, r2t->r2tl, + tqpair); + + if (tcp_req->state == NVME_TCP_REQ_ACTIVE) { + assert(tcp_req->active_r2ts == 0); + tcp_req->state = NVME_TCP_REQ_ACTIVE_R2T; + } + + tcp_req->active_r2ts++; + if (tcp_req->active_r2ts > tqpair->maxr2t) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_R2T_LIMIT_EXCEEDED; + SPDK_ERRLOG("Invalid R2T: it exceeds the R2T maixmal=%u for tqpair=%p\n", tqpair->maxr2t, tqpair); + goto end; + } + + if (tcp_req->datao != r2t->r2to) { + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2to); + goto end; + + } + + if ((r2t->r2tl + r2t->r2to) > tcp_req->req->payload_size) { + SPDK_ERRLOG("Invalid R2T info for tcp_req=%p: (r2to(%u) + r2tl(%u)) exceeds payload_size(%u)\n", + tcp_req, r2t->r2to, r2t->r2tl, tqpair->maxh2cdata); + fes = SPDK_NVME_TCP_TERM_REQ_FES_DATA_TRANSFER_OUT_OF_RANGE; + error_offset = offsetof(struct spdk_nvme_tcp_r2t_hdr, r2tl); + goto end; + + } + + tcp_req->ttag = r2t->ttag; + tcp_req->r2tl_remain = r2t->r2tl; + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + + if (spdk_likely(tcp_req->ordering.send_ack)) { + nvme_tcp_send_h2c_data(tcp_req); + } + return; + +end: + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + +} + +static void +nvme_tcp_pdu_psh_handle(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + struct nvme_tcp_pdu *pdu; + int rc; + uint32_t crc32c, error_offset = 0; + enum spdk_nvme_tcp_term_req_fes fes; + + assert(tqpair->recv_state == NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH); + pdu = &tqpair->recv_pdu; + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "enter: pdu type =%u\n", pdu->hdr.common.pdu_type); + /* check header digest if needed */ + if (pdu->has_hdgst) { + crc32c = nvme_tcp_pdu_calc_header_digest(pdu); + rc = MATCH_DIGEST_WORD((uint8_t *)pdu->hdr.raw + pdu->hdr.common.hlen, crc32c); + if (rc == 0) { + SPDK_ERRLOG("header digest error on tqpair=(%p) with pdu=%p\n", tqpair, pdu); + fes = SPDK_NVME_TCP_TERM_REQ_FES_HDGST_ERROR; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + return; + + } + } + + switch (pdu->hdr.common.pdu_type) { + case SPDK_NVME_TCP_PDU_TYPE_IC_RESP: + nvme_tcp_icresp_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_CAPSULE_RESP: + nvme_tcp_capsule_resp_hdr_handle(tqpair, pdu, reaped); + break; + case SPDK_NVME_TCP_PDU_TYPE_C2H_DATA: + nvme_tcp_c2h_data_hdr_handle(tqpair, pdu); + break; + + case SPDK_NVME_TCP_PDU_TYPE_C2H_TERM_REQ: + nvme_tcp_c2h_term_req_hdr_handle(tqpair, pdu); + break; + case SPDK_NVME_TCP_PDU_TYPE_R2T: + nvme_tcp_r2t_hdr_handle(tqpair, pdu); + break; + + default: + SPDK_ERRLOG("Unexpected PDU type 0x%02x\n", tqpair->recv_pdu.hdr.common.pdu_type); + fes = SPDK_NVME_TCP_TERM_REQ_FES_INVALID_HEADER_FIELD; + error_offset = 1; + nvme_tcp_qpair_send_h2c_term_req(tqpair, pdu, fes, error_offset); + break; + } + +} + +static int +nvme_tcp_read_pdu(struct nvme_tcp_qpair *tqpair, uint32_t *reaped) +{ + int rc = 0; + struct nvme_tcp_pdu 
*pdu; + uint32_t data_len; + enum nvme_tcp_pdu_recv_state prev_state; + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = tqpair->recv_state; + switch (tqpair->recv_state) { + /* If in a new state */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY: + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH); + break; + /* common header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_CH: + pdu = &tqpair->recv_pdu; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + rc = nvme_tcp_read_data(tqpair->sock, + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) - pdu->ch_valid_bytes, + (uint8_t *)&pdu->hdr.common + pdu->ch_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + pdu->ch_valid_bytes += rc; + if (pdu->ch_valid_bytes < sizeof(struct spdk_nvme_tcp_common_pdu_hdr)) { + return NVME_TCP_PDU_IN_PROGRESS; + } + } + + /* The command header of this PDU has now been read from the socket. */ + nvme_tcp_pdu_ch_handle(tqpair); + break; + /* Wait for the pdu specific header */ + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PSH: + pdu = &tqpair->recv_pdu; + rc = nvme_tcp_read_data(tqpair->sock, + pdu->psh_len - pdu->psh_valid_bytes, + (uint8_t *)&pdu->hdr.raw + sizeof(struct spdk_nvme_tcp_common_pdu_hdr) + pdu->psh_valid_bytes); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->psh_valid_bytes += rc; + if (pdu->psh_valid_bytes < pdu->psh_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + /* All header(ch, psh, head digist) of this PDU has now been read from the socket. */ + nvme_tcp_pdu_psh_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_PAYLOAD: + pdu = &tqpair->recv_pdu; + /* check whether the data is valid, if not we just return */ + if (!pdu->data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + data_len = pdu->data_len; + /* data digest */ + if (spdk_unlikely((pdu->hdr.common.pdu_type == SPDK_NVME_TCP_PDU_TYPE_C2H_DATA) && + tqpair->host_ddgst_enable)) { + data_len += SPDK_NVME_TCP_DIGEST_LEN; + pdu->ddgst_enable = true; + } + + rc = nvme_tcp_read_payload_data(tqpair->sock, pdu); + if (rc < 0) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_ERROR); + break; + } + + pdu->readv_offset += rc; + if (pdu->readv_offset < data_len) { + return NVME_TCP_PDU_IN_PROGRESS; + } + + assert(pdu->readv_offset == data_len); + /* All of this PDU has now been read from the socket. */ + nvme_tcp_pdu_payload_handle(tqpair, reaped); + break; + case NVME_TCP_PDU_RECV_STATE_ERROR: + rc = NVME_TCP_PDU_FATAL; + break; + default: + assert(0); + break; + } + } while (prev_state != tqpair->recv_state); + + return rc; +} + +static void +nvme_tcp_qpair_check_timeout(struct spdk_nvme_qpair *qpair) +{ + uint64_t t02; + struct nvme_tcp_req *tcp_req, *tmp; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvme_ctrlr_process *active_proc; + + /* Don't check timeouts during controller initialization. */ + if (ctrlr->state != NVME_CTRLR_STATE_READY) { + return; + } + + if (nvme_qpair_is_admin_queue(qpair)) { + active_proc = nvme_ctrlr_get_current_process(ctrlr); + } else { + active_proc = qpair->active_proc; + } + + /* Only check timeouts if the current process has a timeout callback. 
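Timeout callbacks are registered per process, e.g. (hypothetical values) spdk_nvme_ctrlr_register_timeout_callback(ctrlr, 10000000 /* 10 s */, timeout_cb, NULL); with no callback registered there is nothing to report, so the scan is skipped.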
*/ + if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) { + return; + } + + t02 = spdk_get_ticks(); + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + if (nvme_request_check_timeout(tcp_req->req, tcp_req->cid, active_proc, t02)) { + /* + * The requests are in order, so as soon as one has not timed out, + * stop iterating. + */ + break; + } + } +} + +static int +nvme_tcp_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + uint32_t reaped; + int rc; + + rc = spdk_sock_flush(tqpair->sock); + if (rc < 0) { + return rc; + } + + if (max_completions == 0) { + max_completions = tqpair->num_entries; + } else { + max_completions = spdk_min(max_completions, tqpair->num_entries); + } + + reaped = 0; + do { + rc = nvme_tcp_read_pdu(tqpair, &reaped); + if (rc < 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + goto fail; + } else if (rc == 0) { + /* Partial PDU is read */ + break; + } + + } while (reaped < max_completions); + + if (spdk_unlikely(tqpair->qpair.ctrlr->timeout_enabled)) { + nvme_tcp_qpair_check_timeout(qpair); + } + + return reaped; +fail: + + /* + * Since admin queues take the ctrlr_lock before entering this function, + * we can call nvme_transport_ctrlr_disconnect_qpair. For other qpairs we need + * to call the generic function which will take the lock for us. + */ + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_UNKNOWN; + + if (nvme_qpair_is_admin_queue(qpair)) { + nvme_transport_ctrlr_disconnect_qpair(qpair->ctrlr, qpair); + } else { + nvme_ctrlr_disconnect_qpair(qpair); + } + return -ENXIO; +} + +static void +nvme_tcp_qpair_sock_cb(void *ctx, struct spdk_sock_group *group, struct spdk_sock *sock) +{ + struct spdk_nvme_qpair *qpair = ctx; + struct nvme_tcp_poll_group *pgroup = nvme_tcp_poll_group(qpair->poll_group); + int32_t num_completions; + + num_completions = spdk_nvme_qpair_process_completions(qpair, pgroup->completions_per_qpair); + + if (pgroup->num_completions >= 0 && num_completions >= 0) { + pgroup->num_completions += num_completions; + } else { + pgroup->num_completions = -ENXIO; + } +} + +static int +nvme_tcp_qpair_icreq_send(struct nvme_tcp_qpair *tqpair) +{ + struct spdk_nvme_tcp_ic_req *ic_req; + struct nvme_tcp_pdu *pdu; + uint64_t icreq_timeout_tsc; + int rc; + + pdu = &tqpair->send_pdu; + memset(&tqpair->send_pdu, 0, sizeof(tqpair->send_pdu)); + ic_req = &pdu->hdr.ic_req; + + ic_req->common.pdu_type = SPDK_NVME_TCP_PDU_TYPE_IC_REQ; + ic_req->common.hlen = ic_req->common.plen = sizeof(*ic_req); + ic_req->pfv = 0; + ic_req->maxr2t = NVME_TCP_MAX_R2T_DEFAULT - 1; + ic_req->hpda = NVME_TCP_HPDA_DEFAULT; + + ic_req->dgst.bits.hdgst_enable = tqpair->qpair.ctrlr->opts.header_digest; + ic_req->dgst.bits.ddgst_enable = tqpair->qpair.ctrlr->opts.data_digest; + + nvme_tcp_qpair_write_pdu(tqpair, pdu, nvme_tcp_send_icreq_complete, tqpair); + + icreq_timeout_tsc = spdk_get_ticks() + (NVME_TCP_TIME_OUT_IN_SECONDS * spdk_get_ticks_hz()); + do { + rc = nvme_tcp_qpair_process_completions(&tqpair->qpair, 0); + } while ((tqpair->state == NVME_TCP_QPAIR_STATE_INVALID) && + (rc == 0) && (spdk_get_ticks() <= icreq_timeout_tsc)); + + if (tqpair->state != NVME_TCP_QPAIR_STATE_RUNNING) { + SPDK_ERRLOG("Failed to construct the tqpair=%p via correct icresp\n", tqpair); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Succesfully construct the tqpair=%p via correct 
icresp\n", tqpair); + + return 0; +} + +static int +nvme_tcp_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + int rc; + struct nvme_tcp_qpair *tqpair; + int family; + long int port; + struct spdk_sock_opts opts; + + tqpair = nvme_tcp_qpair(qpair); + + switch (ctrlr->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family); + + memset(&dst_addr, 0, sizeof(dst_addr)); + + SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid); + rc = nvme_tcp_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid); + if (rc != 0) { + SPDK_ERRLOG("dst_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + + if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) { + memset(&src_addr, 0, sizeof(src_addr)); + rc = nvme_tcp_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid); + if (rc != 0) { + SPDK_ERRLOG("src_addr nvme_tcp_parse_addr() failed\n"); + return -1; + } + } + + port = spdk_strtol(ctrlr->trid.trsvcid, 10); + if (port <= 0 || port >= INT_MAX) { + SPDK_ERRLOG("Invalid port: %s\n", ctrlr->trid.trsvcid); + return -1; + } + + opts.opts_size = sizeof(opts); + spdk_sock_get_default_opts(&opts); + opts.priority = ctrlr->trid.priority; + tqpair->sock = spdk_sock_connect_ext(ctrlr->trid.traddr, port, NULL, &opts); + if (!tqpair->sock) { + SPDK_ERRLOG("sock connection error of tqpair=%p with addr=%s, port=%ld\n", + tqpair, ctrlr->trid.traddr, port); + return -1; + } + + tqpair->maxr2t = NVME_TCP_MAX_R2T_DEFAULT; + /* Explicitly set the state and recv_state of tqpair */ + tqpair->state = NVME_TCP_QPAIR_STATE_INVALID; + if (tqpair->recv_state != NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY) { + nvme_tcp_qpair_set_recv_state(tqpair, NVME_TCP_PDU_RECV_STATE_AWAIT_PDU_READY); + } + rc = nvme_tcp_qpair_icreq_send(tqpair); + if (rc != 0) { + SPDK_ERRLOG("Unable to connect the tqpair\n"); + return -1; + } + + rc = nvme_fabric_qpair_connect(&tqpair->qpair, tqpair->num_entries); + if (rc < 0) { + SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n"); + return -1; + } + + return 0; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr, + uint16_t qid, uint32_t qsize, + enum spdk_nvme_qprio qprio, + uint32_t num_requests) +{ + struct nvme_tcp_qpair *tqpair; + struct spdk_nvme_qpair *qpair; + int rc; + + tqpair = calloc(1, sizeof(struct nvme_tcp_qpair)); + if (!tqpair) { + SPDK_ERRLOG("failed to get create tqpair\n"); + return NULL; + } + + tqpair->num_entries = qsize; + qpair = &tqpair->qpair; + rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests); + if (rc != 0) { + free(tqpair); + return NULL; + } + + rc = nvme_tcp_alloc_reqs(tqpair); + if (rc) { + nvme_tcp_ctrlr_delete_io_qpair(ctrlr, qpair); + return NULL; + } + + return qpair; +} + +static struct spdk_nvme_qpair * +nvme_tcp_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + return nvme_tcp_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio, + opts->io_queue_requests); +} + +static struct spdk_nvme_ctrlr *nvme_tcp_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void 
*devhandle) +{ + struct nvme_tcp_ctrlr *tctrlr; + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + int rc; + + tctrlr = calloc(1, sizeof(*tctrlr)); + if (tctrlr == NULL) { + SPDK_ERRLOG("could not allocate ctrlr\n"); + return NULL; + } + + tctrlr->ctrlr.opts = *opts; + tctrlr->ctrlr.trid = *trid; + + rc = nvme_ctrlr_construct(&tctrlr->ctrlr); + if (rc != 0) { + free(tctrlr); + return NULL; + } + + tctrlr->ctrlr.adminq = nvme_tcp_ctrlr_create_qpair(&tctrlr->ctrlr, 0, + tctrlr->ctrlr.opts.admin_queue_size, 0, + tctrlr->ctrlr.opts.admin_queue_size); + if (!tctrlr->ctrlr.adminq) { + SPDK_ERRLOG("failed to create admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + rc = nvme_transport_ctrlr_connect_qpair(&tctrlr->ctrlr, tctrlr->ctrlr.adminq); + if (rc < 0) { + SPDK_ERRLOG("failed to connect admin qpair\n"); + nvme_tcp_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_cap(&tctrlr->ctrlr, &cap)) { + SPDK_ERRLOG("get_cap() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_get_vs(&tctrlr->ctrlr, &vs)) { + SPDK_ERRLOG("get_vs() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + if (nvme_ctrlr_add_process(&tctrlr->ctrlr, 0) != 0) { + SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n"); + nvme_ctrlr_destruct(&tctrlr->ctrlr); + return NULL; + } + + nvme_ctrlr_init_cap(&tctrlr->ctrlr, &cap, &vs); + + return &tctrlr->ctrlr; +} + +static uint32_t +nvme_tcp_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + /* TCP transport doens't limit maximum IO transfer size. */ + return UINT32_MAX; +} + +static uint16_t +nvme_tcp_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + /* + * We do not support >1 SGE in the initiator currently, + * so we can only return 1 here. Once that support is + * added, this should return ctrlr->cdata.nvmf_specific.msdbd + * instead. 
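(MSDBD is the Maximum SGL Data Block Descriptors field that an NVMe-oF controller reports in its Identify Controller data.)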
+ */ + return 1; +} + +static int +nvme_tcp_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_req *tcp_req, *tmp; + int rc; + + assert(iter_fn != NULL); + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + + rc = iter_fn(tcp_req->req, arg); + if (rc != 0) { + return rc; + } + } + + return 0; +} + +static void +nvme_tcp_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_req *tcp_req, *tmp; + struct spdk_nvme_cpl cpl; + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + cpl.status.sc = SPDK_NVME_SC_ABORTED_SQ_DELETION; + cpl.status.sct = SPDK_NVME_SCT_GENERIC; + + TAILQ_FOREACH_SAFE(tcp_req, &tqpair->outstanding_reqs, link, tmp) { + assert(tcp_req->req != NULL); + if (tcp_req->req->cmd.opc != SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) { + continue; + } + + nvme_tcp_req_complete(tcp_req, &cpl); + nvme_tcp_req_put(tqpair, tcp_req); + } +} + +static struct spdk_nvme_transport_poll_group * +nvme_tcp_poll_group_create(void) +{ + struct nvme_tcp_poll_group *group = calloc(1, sizeof(*group)); + + if (group == NULL) { + SPDK_ERRLOG("Unable to allocate poll group.\n"); + return NULL; + } + + group->sock_group = spdk_sock_group_create(group); + if (group->sock_group == NULL) { + free(group); + SPDK_ERRLOG("Unable to allocate sock group.\n"); + return NULL; + } + + return &group->group; +} + +static int +nvme_tcp_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + return 0; +} + +static int +nvme_tcp_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(qpair->poll_group); + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + + if (tqpair->sock && group->sock_group) { + if (spdk_sock_group_remove_sock(group->sock_group, tqpair->sock)) { + return -EPROTO; + } + } + return 0; +} + +static int +nvme_tcp_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + struct nvme_tcp_qpair *tqpair = nvme_tcp_qpair(qpair); + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + /* disconnected qpairs won't have a sock to add. 
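Their socket is registered with the group later, in nvme_tcp_poll_group_connect_qpair(), once the qpair actually connects.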
*/ + if (nvme_qpair_get_state(qpair) >= NVME_QPAIR_CONNECTED) { + if (spdk_sock_group_add_sock(group->sock_group, tqpair->sock, nvme_tcp_qpair_sock_cb, qpair)) { + return -EPROTO; + } + } + + return 0; +} + +static int +nvme_tcp_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return nvme_poll_group_disconnect_qpair(qpair); + } + + return 0; +} + +static int64_t +nvme_tcp_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + struct spdk_nvme_qpair *qpair, *tmp_qpair; + + group->completions_per_qpair = completions_per_qpair; + group->num_completions = 0; + + spdk_sock_group_poll(group->sock_group); + + STAILQ_FOREACH_SAFE(qpair, &tgroup->disconnected_qpairs, poll_group_stailq, tmp_qpair) { + disconnected_qpair_cb(qpair, tgroup->group->ctx); + } + + return group->num_completions; +} + +static int +nvme_tcp_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + int rc; + struct nvme_tcp_poll_group *group = nvme_tcp_poll_group(tgroup); + + if (!STAILQ_EMPTY(&tgroup->connected_qpairs) || !STAILQ_EMPTY(&tgroup->disconnected_qpairs)) { + return -EBUSY; + } + + rc = spdk_sock_group_close(&group->sock_group); + if (rc != 0) { + SPDK_ERRLOG("Failed to close the sock group for a tcp poll group.\n"); + assert(false); + } + + free(tgroup); + + return 0; +} + +const struct spdk_nvme_transport_ops tcp_ops = { + .name = "TCP", + .type = SPDK_NVME_TRANSPORT_TCP, + .ctrlr_construct = nvme_tcp_ctrlr_construct, + .ctrlr_scan = nvme_fabric_ctrlr_scan, + .ctrlr_destruct = nvme_tcp_ctrlr_destruct, + .ctrlr_enable = nvme_tcp_ctrlr_enable, + + .ctrlr_set_reg_4 = nvme_fabric_ctrlr_set_reg_4, + .ctrlr_set_reg_8 = nvme_fabric_ctrlr_set_reg_8, + .ctrlr_get_reg_4 = nvme_fabric_ctrlr_get_reg_4, + .ctrlr_get_reg_8 = nvme_fabric_ctrlr_get_reg_8, + + .ctrlr_get_max_xfer_size = nvme_tcp_ctrlr_get_max_xfer_size, + .ctrlr_get_max_sges = nvme_tcp_ctrlr_get_max_sges, + + .ctrlr_create_io_qpair = nvme_tcp_ctrlr_create_io_qpair, + .ctrlr_delete_io_qpair = nvme_tcp_ctrlr_delete_io_qpair, + .ctrlr_connect_qpair = nvme_tcp_ctrlr_connect_qpair, + .ctrlr_disconnect_qpair = nvme_tcp_ctrlr_disconnect_qpair, + + .qpair_abort_reqs = nvme_tcp_qpair_abort_reqs, + .qpair_reset = nvme_tcp_qpair_reset, + .qpair_submit_request = nvme_tcp_qpair_submit_request, + .qpair_process_completions = nvme_tcp_qpair_process_completions, + .qpair_iterate_requests = nvme_tcp_qpair_iterate_requests, + .admin_qpair_abort_aers = nvme_tcp_admin_qpair_abort_aers, + + .poll_group_create = nvme_tcp_poll_group_create, + .poll_group_connect_qpair = nvme_tcp_poll_group_connect_qpair, + .poll_group_disconnect_qpair = nvme_tcp_poll_group_disconnect_qpair, + .poll_group_add = nvme_tcp_poll_group_add, + .poll_group_remove = nvme_tcp_poll_group_remove, + .poll_group_process_completions = nvme_tcp_poll_group_process_completions, + .poll_group_destroy = nvme_tcp_poll_group_destroy, +}; + +SPDK_NVME_TRANSPORT_REGISTER(tcp, &tcp_ops); diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c new file mode 100644 index 000000000..76efd5966 --- /dev/null +++ b/src/spdk/lib/nvme/nvme_transport.c @@ -0,0 +1,591 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe transport abstraction
+ */
+
+#include "nvme_internal.h"
+#include "spdk/queue.h"
+
+#define SPDK_MAX_NUM_OF_TRANSPORTS 16
+
+struct spdk_nvme_transport {
+	struct spdk_nvme_transport_ops	ops;
+	TAILQ_ENTRY(spdk_nvme_transport)	link;
+};
+
+TAILQ_HEAD(nvme_transport_list, spdk_nvme_transport) g_spdk_nvme_transports =
+	TAILQ_HEAD_INITIALIZER(g_spdk_nvme_transports);
+
+struct spdk_nvme_transport g_spdk_transports[SPDK_MAX_NUM_OF_TRANSPORTS] = {};
+int g_current_transport_index = 0;
+
+const struct spdk_nvme_transport *
+nvme_get_first_transport(void)
+{
+	return TAILQ_FIRST(&g_spdk_nvme_transports);
+}
+
+const struct spdk_nvme_transport *
+nvme_get_next_transport(const struct spdk_nvme_transport *transport)
+{
+	return TAILQ_NEXT(transport, link);
+}
+
+/*
+ * Unfortunately, due to NVMe PCIe multiprocess support, we cannot store the
+ * transport object in either the controller struct or the admin qpair. This means
+ * that a lot of admin-related transport calls will have to call nvme_get_transport
+ * in order to know which functions to call.
+ * In the I/O path, we have the ability to store the transport struct in the I/O
+ * qpairs to avoid taking a performance hit.
+ */
+const struct spdk_nvme_transport *
+nvme_get_transport(const char *transport_name)
+{
+	struct spdk_nvme_transport *registered_transport;
+
+	TAILQ_FOREACH(registered_transport, &g_spdk_nvme_transports, link) {
+		if (strcasecmp(transport_name, registered_transport->ops.name) == 0) {
+			return registered_transport;
+		}
+	}
+
+	return NULL;
+}
+
+bool
+spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype)
+{
+	return nvme_get_transport(spdk_nvme_transport_id_trtype_str(trtype)) == NULL ? false : true;
+}
+
+bool
+spdk_nvme_transport_available_by_name(const char *transport_name)
+{
+	return nvme_get_transport(transport_name) == NULL ?
false : true; +} + +void spdk_nvme_transport_register(const struct spdk_nvme_transport_ops *ops) +{ + struct spdk_nvme_transport *new_transport; + + if (nvme_get_transport(ops->name)) { + SPDK_ERRLOG("Double registering NVMe transport %s is prohibited.\n", ops->name); + assert(false); + } + + if (g_current_transport_index == SPDK_MAX_NUM_OF_TRANSPORTS) { + SPDK_ERRLOG("Unable to register new NVMe transport.\n"); + assert(false); + return; + } + new_transport = &g_spdk_transports[g_current_transport_index++]; + + new_transport->ops = *ops; + TAILQ_INSERT_TAIL(&g_spdk_nvme_transports, new_transport, link); +} + +struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid, + const struct spdk_nvme_ctrlr_opts *opts, + void *devhandle) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(trid->trstring); + struct spdk_nvme_ctrlr *ctrlr; + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", trid->trstring); + return NULL; + } + + ctrlr = transport->ops.ctrlr_construct(trid, opts, devhandle); + + return ctrlr; +} + +int +nvme_transport_ctrlr_scan(struct spdk_nvme_probe_ctx *probe_ctx, + bool direct_connect) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(probe_ctx->trid.trstring); + + if (transport == NULL) { + SPDK_ERRLOG("Transport %s doesn't exist.", probe_ctx->trid.trstring); + return -ENOENT; + } + + return transport->ops.ctrlr_scan(probe_ctx, direct_connect); +} + +int +nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_destruct(ctrlr); +} + +int +nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_enable(ctrlr); +} + +int +nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_set_reg_8(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_4(ctrlr, offset, value); +} + +int +nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_reg_8(ctrlr, offset, value); +} + +uint32_t +nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_xfer_size(ctrlr); +} + +uint16_t +nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = 
nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + return transport->ops.ctrlr_get_max_sges(ctrlr); +} + +int +nvme_transport_ctrlr_reserve_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_reserve_cmb != NULL) { + return transport->ops.ctrlr_reserve_cmb(ctrlr); + } + + return -ENOTSUP; +} + +void * +nvme_transport_ctrlr_map_cmb(struct spdk_nvme_ctrlr *ctrlr, size_t *size) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_map_cmb != NULL) { + return transport->ops.ctrlr_map_cmb(ctrlr, size); + } + + return NULL; +} + +int +nvme_transport_ctrlr_unmap_cmb(struct spdk_nvme_ctrlr *ctrlr) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + if (transport->ops.ctrlr_unmap_cmb != NULL) { + return transport->ops.ctrlr_unmap_cmb(ctrlr); + } + + return 0; +} + +struct spdk_nvme_qpair * +nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, + const struct spdk_nvme_io_qpair_opts *opts) +{ + struct spdk_nvme_qpair *qpair; + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + qpair = transport->ops.ctrlr_create_io_qpair(ctrlr, qid, opts); + if (qpair != NULL && !nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + return qpair; +} + +int +nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + assert(transport != NULL); + + /* Do not rely on qpair->transport. For multi-process cases, a foreign process may delete + * the IO qpair, in which case the transport object would be invalid (each process has their + * own unique transport objects since they contain function pointers). So we look up the + * transport object in the delete_io_qpair case. + */ + return transport->ops.ctrlr_delete_io_qpair(ctrlr, qpair); +} + +int +nvme_transport_ctrlr_connect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + uint8_t transport_failure_reason; + int rc; + + assert(transport != NULL); + if (!nvme_qpair_is_admin_queue(qpair)) { + qpair->transport = transport; + } + + transport_failure_reason = qpair->transport_failure_reason; + qpair->transport_failure_reason = SPDK_NVME_QPAIR_FAILURE_NONE; + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTING); + rc = transport->ops.ctrlr_connect_qpair(ctrlr, qpair); + if (rc != 0) { + goto err; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_CONNECTED); + if (qpair->poll_group) { + rc = nvme_poll_group_connect_qpair(qpair); + if (rc) { + goto err; + } + } + + return rc; + +err: + /* If the qpair was unable to reconnect, restore the original failure reason. 
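+ * It was cleared to SPDK_NVME_QPAIR_FAILURE_NONE above so the connect attempt could record a fresh reason.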
*/ + qpair->transport_failure_reason = transport_failure_reason; + nvme_transport_ctrlr_disconnect_qpair(ctrlr, qpair); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); + return rc; +} + +void +nvme_transport_ctrlr_disconnect_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(ctrlr->trid.trstring); + + if (nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTING || + nvme_qpair_get_state(qpair) == NVME_QPAIR_DISCONNECTED) { + return; + } + + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTING); + assert(transport != NULL); + if (qpair->poll_group) { + nvme_poll_group_disconnect_qpair(qpair); + } + + transport->ops.ctrlr_disconnect_qpair(ctrlr, qpair); + + nvme_qpair_abort_reqs(qpair, 0); + nvme_qpair_set_state(qpair, NVME_QPAIR_DISCONNECTED); +} + +void +nvme_transport_qpair_abort_reqs(struct spdk_nvme_qpair *qpair, uint32_t dnr) +{ + const struct spdk_nvme_transport *transport; + + assert(dnr <= 1); + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + qpair->transport->ops.qpair_abort_reqs(qpair, dnr); + } else { + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + transport->ops.qpair_abort_reqs(qpair, dnr); + } +} + +int +nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_reset(qpair); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_reset(qpair); +} + +int +nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_submit_request(qpair, req); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_submit_request(qpair, req); +} + +int32_t +nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_process_completions(qpair, max_completions); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_process_completions(qpair, max_completions); +} + +int +nvme_transport_qpair_iterate_requests(struct spdk_nvme_qpair *qpair, + int (*iter_fn)(struct nvme_request *req, void *arg), + void *arg) +{ + const struct spdk_nvme_transport *transport; + + if (spdk_likely(!nvme_qpair_is_admin_queue(qpair))) { + return qpair->transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); + } + + transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + assert(transport != NULL); + return transport->ops.qpair_iterate_requests(qpair, iter_fn, arg); +} + +void +nvme_transport_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair) +{ + const struct spdk_nvme_transport *transport = nvme_get_transport(qpair->ctrlr->trid.trstring); + + assert(transport != NULL); + transport->ops.admin_qpair_abort_aers(qpair); +} + +struct spdk_nvme_transport_poll_group * +nvme_transport_poll_group_create(const struct spdk_nvme_transport *transport) +{ + struct spdk_nvme_transport_poll_group *group = NULL; + + group = transport->ops.poll_group_create(); + if (group) 
{ + group->transport = transport; + STAILQ_INIT(&group->connected_qpairs); + STAILQ_INIT(&group->disconnected_qpairs); + } + + return group; +} + +int +nvme_transport_poll_group_add(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_add(tgroup, qpair); + if (rc == 0) { + qpair->poll_group = tgroup; + assert(nvme_qpair_get_state(qpair) < NVME_QPAIR_CONNECTED); + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + } + + return rc; +} + +int +nvme_transport_poll_group_remove(struct spdk_nvme_transport_poll_group *tgroup, + struct spdk_nvme_qpair *qpair) +{ + int rc; + + rc = tgroup->transport->ops.poll_group_remove(tgroup, qpair); + if (rc == 0) { + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + } else { + return -ENOENT; + } + + qpair->poll_group = NULL; + qpair->poll_group_tailq_head = NULL; + } + + return rc; +} + +int64_t +nvme_transport_poll_group_process_completions(struct spdk_nvme_transport_poll_group *tgroup, + uint32_t completions_per_qpair, spdk_nvme_disconnected_qpair_cb disconnected_qpair_cb) +{ + struct spdk_nvme_qpair *qpair; + int64_t rc; + + tgroup->in_completion_context = true; + rc = tgroup->transport->ops.poll_group_process_completions(tgroup, completions_per_qpair, + disconnected_qpair_cb); + tgroup->in_completion_context = false; + + if (spdk_unlikely(tgroup->num_qpairs_to_delete > 0)) { + /* deleted qpairs are more likely to be in the disconnected qpairs list. */ + STAILQ_FOREACH(qpair, &tgroup->disconnected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + + STAILQ_FOREACH(qpair, &tgroup->connected_qpairs, poll_group_stailq) { + if (spdk_unlikely(qpair->delete_after_completion_context)) { + spdk_nvme_ctrlr_free_io_qpair(qpair); + if (--tgroup->num_qpairs_to_delete == 0) { + return rc; + } + } + } + /* Just in case. */ + SPDK_DEBUGLOG(SPDK_LOG_NVME, "Mismatch between qpairs to delete and poll group number.\n"); + tgroup->num_qpairs_to_delete = 0; + } + + return rc; +} + +int +nvme_transport_poll_group_destroy(struct spdk_nvme_transport_poll_group *tgroup) +{ + return tgroup->transport->ops.poll_group_destroy(tgroup); +} + +int +nvme_transport_poll_group_disconnect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + rc = tgroup->transport->ops.poll_group_disconnect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->disconnected_qpairs; + STAILQ_REMOVE(&tgroup->connected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->disconnected_qpairs, qpair, poll_group_stailq); + /* EINPROGRESS indicates that a call has already been made to this function. + * It just keeps us from segfaulting on a double removal/insert. 
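+ * In that case nothing is moved between the connected and disconnected lists here; the call already in progress handles that, and we simply report success.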
+ */ + } else if (rc == -EINPROGRESS) { + rc = 0; + } + return rc; + } + + return -EINVAL; +} + +int +nvme_transport_poll_group_connect_qpair(struct spdk_nvme_qpair *qpair) +{ + struct spdk_nvme_transport_poll_group *tgroup; + int rc; + + tgroup = qpair->poll_group; + + if (qpair->poll_group_tailq_head == &tgroup->connected_qpairs) { + return 0; + } + + if (qpair->poll_group_tailq_head == &tgroup->disconnected_qpairs) { + rc = tgroup->transport->ops.poll_group_connect_qpair(qpair); + if (rc == 0) { + qpair->poll_group_tailq_head = &tgroup->connected_qpairs; + STAILQ_REMOVE(&tgroup->disconnected_qpairs, qpair, spdk_nvme_qpair, poll_group_stailq); + STAILQ_INSERT_TAIL(&tgroup->connected_qpairs, qpair, poll_group_stailq); + } + + return rc == -EINPROGRESS ? 0 : rc; + } + + + return -EINVAL; +} diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c new file mode 100644 index 000000000..1bcfff1cb --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.c @@ -0,0 +1,213 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include "spdk/log.h"
+
+#include "nvme_uevent.h"
+
+#ifdef __linux__
+
+#include <linux/netlink.h>
+
+#define SPDK_UEVENT_MSG_LEN 4096
+
+int
+nvme_uevent_connect(void)
+{
+	struct sockaddr_nl addr;
+	int netlink_fd;
+	int size = 64 * 1024;
+	int flag;
+
+	memset(&addr, 0, sizeof(addr));
+	addr.nl_family = AF_NETLINK;
+	addr.nl_pid = getpid();
+	addr.nl_groups = 0xffffffff;
+
+	netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+	if (netlink_fd < 0) {
+		return -1;
+	}
+
+	setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size));
+
+	flag = fcntl(netlink_fd, F_GETFL);
+	if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+		SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd,
+			    spdk_strerror(errno));
+		close(netlink_fd);
+		return -1;
+	}
+
+	if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+		close(netlink_fd);
+		return -1;
+	}
+	return netlink_fd;
+}
+
+/* Note: We only parse events from the uio subsystem and ignore events
+ * from every other subsystem (vfio-pci devices are matched on the
+ * DRIVER= field further below). A uio event looks like this:
+ * action: "add" or "remove"
+ * subsystem: "uio"
+ * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0"
+ */
+static int
+parse_event(const char *buf, struct spdk_uevent *event)
+{
+	char action[SPDK_UEVENT_MSG_LEN];
+	char subsystem[SPDK_UEVENT_MSG_LEN];
+	char dev_path[SPDK_UEVENT_MSG_LEN];
+	char driver[SPDK_UEVENT_MSG_LEN];
+	char vfio_pci_addr[SPDK_UEVENT_MSG_LEN];
+
+	memset(action, 0, SPDK_UEVENT_MSG_LEN);
+	memset(subsystem, 0, SPDK_UEVENT_MSG_LEN);
+	memset(dev_path, 0, SPDK_UEVENT_MSG_LEN);
+	memset(driver, 0, SPDK_UEVENT_MSG_LEN);
+	memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN);
+
+	while (*buf) {
+		if (!strncmp(buf, "ACTION=", 7)) {
+			buf += 7;
+			snprintf(action, sizeof(action), "%s", buf);
+		} else if (!strncmp(buf, "DEVPATH=", 8)) {
+			buf += 8;
+			snprintf(dev_path, sizeof(dev_path), "%s", buf);
+		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+			buf += 10;
+			snprintf(subsystem, sizeof(subsystem), "%s", buf);
+		} else if (!strncmp(buf, "DRIVER=", 7)) {
+			buf += 7;
+			snprintf(driver, sizeof(driver), "%s", buf);
+		} else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+			buf += 14;
+			snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf);
+		}
+		while (*buf++)
+			;
+	}
+
+	if (!strncmp(subsystem, "uio", 3)) {
+		char *pci_address, *tmp;
+		struct spdk_pci_addr pci_addr;
+
+		event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO;
+		if (!strncmp(action, "add", 3)) {
+			event->action = SPDK_NVME_UEVENT_ADD;
+		}
+		if (!strncmp(action, "remove", 6)) {
+			event->action = SPDK_NVME_UEVENT_REMOVE;
+		}
+		tmp = strstr(dev_path, "/uio/");
+
+		memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path));
+
+		pci_address = strrchr(dev_path, '/');
+		pci_address++;
+		if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) {
+			SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address);
+			return -1;
+		}
+		spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr);
+		return 1;
+	}
+	if (!strncmp(driver, "vfio-pci", 8)) {
+		struct spdk_pci_addr pci_addr;
+
+		event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO;
+		if (!strncmp(action, "bind", 4)) {
+			event->action = SPDK_NVME_UEVENT_ADD;
+		}
+		if (!strncmp(action, "remove", 6)) {
+			event->action = SPDK_NVME_UEVENT_REMOVE;
+		}
+		if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) {
+			SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr);
+			return -1;
+		}
+
spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr); + return 1; + + } + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + int ret; + char buf[SPDK_UEVENT_MSG_LEN]; + + memset(uevent, 0, sizeof(struct spdk_uevent)); + memset(buf, 0, SPDK_UEVENT_MSG_LEN); + + ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT); + if (ret > 0) { + return parse_event(buf, uevent); + } + + if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + return 0; + } else { + SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno)); + return -1; + } + } + + /* connection closed */ + if (ret == 0) { + return -1; + } + return 0; +} + +#else /* Not Linux */ + +int +nvme_uevent_connect(void) +{ + return -1; +} + +int +nvme_get_uevent(int fd, struct spdk_uevent *uevent) +{ + return -1; +} +#endif diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h new file mode 100644 index 000000000..778d73c2a --- /dev/null +++ b/src/spdk/lib/nvme/nvme_uevent.h @@ -0,0 +1,61 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * SPDK uevent + */ + +#include "spdk/env.h" +#include "spdk/nvmf_spec.h" + +#ifndef SPDK_UEVENT_H_ +#define SPDK_UEVENT_H_ + +#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1 +#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2 + +enum spdk_nvme_uevent_action { + SPDK_NVME_UEVENT_ADD = 0, + SPDK_NVME_UEVENT_REMOVE = 1, +}; + +struct spdk_uevent { + enum spdk_nvme_uevent_action action; + int subsystem; + char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1]; +}; + +int nvme_uevent_connect(void); +int nvme_get_uevent(int fd, struct spdk_uevent *uevent); + +#endif /* SPDK_UEVENT_H_ */ diff --git a/src/spdk/lib/nvme/spdk_nvme.map b/src/spdk/lib/nvme/spdk_nvme.map new file mode 100644 index 000000000..63a04eeca --- /dev/null +++ b/src/spdk/lib/nvme/spdk_nvme.map @@ -0,0 +1,185 @@ +{ + global: + + # public functions from nvme.h + spdk_nvme_transport_register; + spdk_nvme_transport_available; + spdk_nvme_transport_available_by_name; + spdk_nvme_transport_id_parse; + spdk_nvme_transport_id_populate_trstring; + spdk_nvme_transport_id_parse_trtype; + spdk_nvme_transport_id_trtype_str; + spdk_nvme_transport_id_adrfam_str; + spdk_nvme_transport_id_parse_adrfam; + spdk_nvme_transport_id_compare; + spdk_nvme_trid_populate_transport; + spdk_nvme_host_id_parse; + + spdk_nvme_prchk_flags_parse; + spdk_nvme_prchk_flags_str; + + spdk_nvme_probe; + spdk_nvme_connect; + spdk_nvme_connect_async; + spdk_nvme_probe_async; + spdk_nvme_probe_poll_async; + spdk_nvme_detach; + + spdk_nvme_ctrlr_is_discovery; + spdk_nvme_ctrlr_get_default_ctrlr_opts; + spdk_nvme_ctrlr_set_trid; + spdk_nvme_ctrlr_reset; + spdk_nvme_ctrlr_fail; + spdk_nvme_ctrlr_is_failed; + spdk_nvme_ctrlr_get_data; + spdk_nvme_ctrlr_get_regs_csts; + spdk_nvme_ctrlr_get_regs_cap; + spdk_nvme_ctrlr_get_regs_vs; + spdk_nvme_ctrlr_get_regs_cmbsz; + spdk_nvme_ctrlr_get_num_ns; + spdk_nvme_ctrlr_get_pci_device; + spdk_nvme_ctrlr_get_max_xfer_size; + spdk_nvme_ctrlr_is_active_ns; + spdk_nvme_ctrlr_get_first_active_ns; + spdk_nvme_ctrlr_get_next_active_ns; + spdk_nvme_ctrlr_is_log_page_supported; + spdk_nvme_ctrlr_is_feature_supported; + spdk_nvme_ctrlr_register_aer_callback; + spdk_nvme_ctrlr_register_timeout_callback; + spdk_nvme_ctrlr_get_default_io_qpair_opts; + spdk_nvme_ctrlr_alloc_io_qpair; + spdk_nvme_ctrlr_connect_io_qpair; + spdk_nvme_ctrlr_disconnect_io_qpair; + spdk_nvme_ctrlr_reconnect_io_qpair; + spdk_nvme_ctrlr_get_admin_qp_failure_reason; + spdk_nvme_ctrlr_free_io_qpair; + spdk_nvme_ctrlr_io_cmd_raw_no_payload_build; + spdk_nvme_ctrlr_cmd_io_raw; + spdk_nvme_ctrlr_cmd_io_raw_with_md; + spdk_nvme_ctrlr_cmd_admin_raw; + spdk_nvme_ctrlr_process_admin_completions; + spdk_nvme_ctrlr_get_ns; + spdk_nvme_ctrlr_cmd_get_log_page; + spdk_nvme_ctrlr_cmd_get_log_page_ext; + spdk_nvme_ctrlr_cmd_abort; + spdk_nvme_ctrlr_cmd_abort_ext; + spdk_nvme_ctrlr_cmd_set_feature; + spdk_nvme_ctrlr_cmd_get_feature; + spdk_nvme_ctrlr_cmd_get_feature_ns; + spdk_nvme_ctrlr_cmd_set_feature_ns; + spdk_nvme_ctrlr_cmd_security_receive; + spdk_nvme_ctrlr_cmd_security_send; + spdk_nvme_ctrlr_security_receive; + spdk_nvme_ctrlr_security_send; + spdk_nvme_ctrlr_get_flags; + spdk_nvme_ctrlr_attach_ns; + spdk_nvme_ctrlr_detach_ns; + spdk_nvme_ctrlr_create_ns; + spdk_nvme_ctrlr_delete_ns; + spdk_nvme_ctrlr_format; + spdk_nvme_ctrlr_update_firmware; + spdk_nvme_ctrlr_get_registers; + spdk_nvme_ctrlr_reserve_cmb; + spdk_nvme_ctrlr_map_cmb; + spdk_nvme_ctrlr_unmap_cmb; + spdk_nvme_ctrlr_get_transport_id; + + spdk_nvme_poll_group_create; + spdk_nvme_poll_group_add; + spdk_nvme_poll_group_remove; 
+	spdk_nvme_poll_group_destroy;
+	spdk_nvme_poll_group_process_completions;
+	spdk_nvme_poll_group_get_ctx;
+
+	spdk_nvme_ns_get_data;
+	spdk_nvme_ns_get_id;
+	spdk_nvme_ns_get_ctrlr;
+	spdk_nvme_ns_is_active;
+	spdk_nvme_ns_get_max_io_xfer_size;
+	spdk_nvme_ns_get_sector_size;
+	spdk_nvme_ns_get_extended_sector_size;
+	spdk_nvme_ns_get_num_sectors;
+	spdk_nvme_ns_get_size;
+	spdk_nvme_ns_get_pi_type;
+	spdk_nvme_ns_get_md_size;
+	spdk_nvme_ns_supports_extended_lba;
+	spdk_nvme_ns_supports_compare;
+	spdk_nvme_ns_get_dealloc_logical_block_read_value;
+	spdk_nvme_ns_get_optimal_io_boundary;
+	spdk_nvme_ns_get_uuid;
+	spdk_nvme_ns_get_flags;
+
+	spdk_nvme_ns_cmd_write;
+	spdk_nvme_ns_cmd_writev;
+	spdk_nvme_ns_cmd_writev_with_md;
+	spdk_nvme_ns_cmd_write_with_md;
+	spdk_nvme_ns_cmd_write_zeroes;
+	spdk_nvme_ns_cmd_write_uncorrectable;
+	spdk_nvme_ns_cmd_read;
+	spdk_nvme_ns_cmd_readv;
+	spdk_nvme_ns_cmd_readv_with_md;
+	spdk_nvme_ns_cmd_read_with_md;
+	spdk_nvme_ns_cmd_dataset_management;
+	spdk_nvme_ns_cmd_flush;
+	spdk_nvme_ns_cmd_reservation_register;
+	spdk_nvme_ns_cmd_reservation_release;
+	spdk_nvme_ns_cmd_reservation_acquire;
+	spdk_nvme_ns_cmd_reservation_report;
+	spdk_nvme_ns_cmd_compare;
+	spdk_nvme_ns_cmd_comparev;
+	spdk_nvme_ns_cmd_comparev_with_md;
+	spdk_nvme_ns_cmd_compare_with_md;
+
+	spdk_nvme_qpair_process_completions;
+	spdk_nvme_qpair_get_failure_reason;
+	spdk_nvme_qpair_add_cmd_error_injection;
+	spdk_nvme_qpair_remove_cmd_error_injection;
+	spdk_nvme_qpair_print_command;
+	spdk_nvme_qpair_print_completion;
+	spdk_nvme_print_command;
+	spdk_nvme_print_completion;
+
+	spdk_nvme_cpl_get_status_string;
+
+	spdk_nvme_rdma_init_hooks;
+
+	spdk_nvme_cuse_get_ctrlr_name;
+	spdk_nvme_cuse_get_ns_name;
+	spdk_nvme_cuse_register;
+	spdk_nvme_cuse_unregister;
+	spdk_nvme_cuse_update_namespaces;
+
+	spdk_nvme_map_prps;
+
+	# public functions from nvme_ocssd.h
+	spdk_nvme_ctrlr_is_ocssd_supported;
+	spdk_nvme_ocssd_ctrlr_cmd_geometry;
+	spdk_nvme_ocssd_ns_cmd_vector_reset;
+	spdk_nvme_ocssd_ns_cmd_vector_write;
+	spdk_nvme_ocssd_ns_cmd_vector_write_with_md;
+	spdk_nvme_ocssd_ns_cmd_vector_read;
+	spdk_nvme_ocssd_ns_cmd_vector_read_with_md;
+	spdk_nvme_ocssd_ns_cmd_vector_copy;
+
+	# public functions from opal.h
+	spdk_opal_dev_construct;
+	spdk_opal_dev_destruct;
+	spdk_opal_get_d0_features_info;
+	spdk_opal_supported;
+	spdk_opal_cmd_take_ownership;
+	spdk_opal_cmd_revert_tper;
+	spdk_opal_cmd_activate_locking_sp;
+	spdk_opal_cmd_lock_unlock;
+	spdk_opal_cmd_setup_locking_range;
+	spdk_opal_cmd_get_max_ranges;
+	spdk_opal_cmd_get_locking_range_info;
+	spdk_opal_cmd_enable_user;
+	spdk_opal_cmd_add_user_to_locking_range;
+	spdk_opal_cmd_set_new_passwd;
+	spdk_opal_cmd_erase_locking_range;
+	spdk_opal_cmd_secure_erase_locking_range;
+	spdk_opal_get_locking_range_info;
+	spdk_opal_free_locking_range_info;
+
+	local: *;
+};
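
For reference, below is a minimal usage sketch (not part of this patch) of how an application might drive the poll-group symbols exported in spdk_nvme.map against the TCP transport registered above. The names poll_group_example and disconnected_cb are illustrative, ctrlr is assumed to be a controller already attached (for example via spdk_nvme_connect()), and the create_only qpair option plus the single-argument spdk_nvme_poll_group_create() are assumed to match this SPDK version; error handling is abbreviated.

#include "spdk/nvme.h"

static void
disconnected_cb(struct spdk_nvme_qpair *qpair, void *poll_group_ctx)
{
	/* Called from spdk_nvme_poll_group_process_completions() for each qpair
	 * found on the poll group's disconnected list. */
}

static int
poll_group_example(struct spdk_nvme_ctrlr *ctrlr)
{
	struct spdk_nvme_io_qpair_opts opts;
	struct spdk_nvme_poll_group *group;
	struct spdk_nvme_qpair *qpair;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
	opts.create_only = true;	/* connect later, after joining the poll group */

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &opts, sizeof(opts));
	group = spdk_nvme_poll_group_create(NULL);
	if (qpair == NULL || group == NULL) {
		return -1;
	}

	/* poll_group_add() places the qpair on the group's disconnected list;
	 * connecting it moves it to the connected list and, for TCP, adds its
	 * sock to the group's sock group. */
	if (spdk_nvme_poll_group_add(group, qpair) != 0 ||
	    spdk_nvme_ctrlr_connect_io_qpair(ctrlr, qpair) != 0) {
		return -1;
	}

	/* One poll iteration; 0 leaves the per-qpair completion limit to the transport. */
	spdk_nvme_poll_group_process_completions(group, 0, disconnected_cb);

	spdk_nvme_ctrlr_disconnect_io_qpair(qpair);
	spdk_nvme_poll_group_remove(group, qpair);
	spdk_nvme_ctrlr_free_io_qpair(qpair);
	spdk_nvme_poll_group_destroy(group);
	return 0;
}

With the TCP transport above, spdk_nvme_poll_group_process_completions() lands in nvme_transport_poll_group_process_completions(), which for TCP polls every socket in the group via spdk_sock_group_poll() and then reports any disconnected qpairs through the callback.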