path: root/src/spdk/lib/nvme
author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-27 18:24:20 +0000
commit     483eb2f56657e8e7f419ab1a4fab8dce9ade8609 (patch)
tree       e5d88d25d870d5dedacb6bbdbe2a966086a0a5cf /src/spdk/lib/nvme
parent     Initial commit. (diff)
Adding upstream version 14.2.21.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/spdk/lib/nvme')
-rw-r--r--  src/spdk/lib/nvme/Makefile                    61
-rw-r--r--  src/spdk/lib/nvme/nvme.c                     862
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr.c              2678
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr_cmd.c           694
-rw-r--r--  src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c      83
-rw-r--r--  src/spdk/lib/nvme/nvme_fabric.c              340
-rw-r--r--  src/spdk/lib/nvme/nvme_internal.h           1003
-rw-r--r--  src/spdk/lib/nvme/nvme_ns.c                  360
-rw-r--r--  src/spdk/lib/nvme/nvme_ns_cmd.c             1026
-rw-r--r--  src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c        232
-rw-r--r--  src/spdk/lib/nvme/nvme_pcie.c               2142
-rw-r--r--  src/spdk/lib/nvme/nvme_qpair.c               663
-rw-r--r--  src/spdk/lib/nvme/nvme_quirks.c              141
-rw-r--r--  src/spdk/lib/nvme/nvme_rdma.c               1634
-rw-r--r--  src/spdk/lib/nvme/nvme_transport.c           219
-rw-r--r--  src/spdk/lib/nvme/nvme_uevent.c              214
-rw-r--r--  src/spdk/lib/nvme/nvme_uevent.h               61
17 files changed, 12413 insertions, 0 deletions
diff --git a/src/spdk/lib/nvme/Makefile b/src/spdk/lib/nvme/Makefile
new file mode 100644
index 00000000..3351c87c
--- /dev/null
+++ b/src/spdk/lib/nvme/Makefile
@@ -0,0 +1,61 @@
+#
+# BSD LICENSE
+#
+# Copyright (c) Intel Corporation.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+
+SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..)
+include $(SPDK_ROOT_DIR)/mk/spdk.common.mk
+
+C_SRCS = nvme_ctrlr_cmd.c nvme_ctrlr.c nvme_fabric.c nvme_ns_cmd.c nvme_ns.c nvme_pcie.c nvme_qpair.c nvme.c nvme_quirks.c nvme_transport.c nvme_uevent.c nvme_ctrlr_ocssd_cmd.c \
+ nvme_ns_ocssd_cmd.c
+C_SRCS-$(CONFIG_RDMA) += nvme_rdma.c
+LIBNAME = nvme
+LOCAL_SYS_LIBS = -luuid
+ifeq ($(CONFIG_RDMA),y)
+LOCAL_SYS_LIBS += -libverbs -lrdmacm
+# Attach these only when building on FreeBSD with RDMA enabled via configure
+ifeq ($(OS),FreeBSD)
+# Mellanox - MLX4 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx4.*)","")
+LOCAL_SYS_LIBS += -lmlx4
+endif
+# Mellanox - MLX5 HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libmlx5.*)","")
+LOCAL_SYS_LIBS += -lmlx5
+endif
+# Chelsio HBA Userspace Library
+ifneq ("$(wildcard /usr/lib/libcxgb4.*)","")
+LOCAL_SYS_LIBS += -lcxgb4
+endif
+endif
+endif
+
+include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk
diff --git a/src/spdk/lib/nvme/nvme.c b/src/spdk/lib/nvme/nvme.c
new file mode 100644
index 00000000..dc657966
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme.c
@@ -0,0 +1,862 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvmf_spec.h"
+#include "nvme_internal.h"
+
+#define SPDK_NVME_DRIVER_NAME "spdk_nvme_driver"
+
+struct nvme_driver *g_spdk_nvme_driver;
+pid_t g_spdk_nvme_pid;
+
+int32_t spdk_nvme_retry_count;
+
+/* gross timeout of 180 seconds in milliseconds */
+static int g_nvme_driver_timeout_ms = 3 * 60 * 1000;
+
+static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_init_ctrlrs =
+ TAILQ_HEAD_INITIALIZER(g_nvme_init_ctrlrs);
+
+/* Per-process attached controller list */
+static TAILQ_HEAD(, spdk_nvme_ctrlr) g_nvme_attached_ctrlrs =
+ TAILQ_HEAD_INITIALIZER(g_nvme_attached_ctrlrs);
+
+/* Returns true if ctrlr should be stored on the multi-process shared_attached_ctrlrs list */
+static bool
+nvme_ctrlr_shared(const struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE;
+}
+
+/* Caller must hold g_spdk_nvme_driver->lock */
+void
+nvme_ctrlr_connected(struct spdk_nvme_ctrlr *ctrlr)
+{
+ TAILQ_INSERT_TAIL(&g_nvme_init_ctrlrs, ctrlr, tailq);
+}
+
+int
+spdk_nvme_detach(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ nvme_ctrlr_proc_put_ref(ctrlr);
+
+ if (nvme_ctrlr_get_ref_count(ctrlr) == 0) {
+ if (nvme_ctrlr_shared(ctrlr)) {
+ TAILQ_REMOVE(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq);
+ } else {
+ TAILQ_REMOVE(&g_nvme_attached_ctrlrs, ctrlr, tailq);
+ }
+ nvme_ctrlr_destruct(ctrlr);
+ }
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ return 0;
+}
+
+void
+nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_completion_poll_status *status = arg;
+
+ /*
+ * Copy status into the argument passed by the caller, so that
+ * the caller can check the status to determine if the
+	 * request passed or failed.
+ */
+ memcpy(&status->cpl, cpl, sizeof(*cpl));
+ status->done = true;
+}
+
+/**
+ * Poll qpair for completions until a command completes.
+ *
+ * \param qpair queue to poll
+ * \param status completion status
+ * \param robust_mutex optional robust mutex to lock while polling qpair
+ *
+ * \return 0 if command completed without error, negative errno on failure
+ *
+ * The command to wait upon must be submitted with nvme_completion_poll_cb as the callback
+ * and status as the callback argument.
+ */
+int
+spdk_nvme_wait_for_completion_robust_lock(
+ struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ pthread_mutex_t *robust_mutex)
+{
+ memset(&status->cpl, 0, sizeof(status->cpl));
+ status->done = false;
+
+ while (status->done == false) {
+ if (robust_mutex) {
+ nvme_robust_mutex_lock(robust_mutex);
+ }
+
+ spdk_nvme_qpair_process_completions(qpair, 0);
+
+ if (robust_mutex) {
+ nvme_robust_mutex_unlock(robust_mutex);
+ }
+ }
+
+ return spdk_nvme_cpl_is_error(&status->cpl) ? -EIO : 0;
+}
+
+int
+spdk_nvme_wait_for_completion(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status)
+{
+ return spdk_nvme_wait_for_completion_robust_lock(qpair, status, NULL);
+}
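
The two wait helpers above are only meaningful when the command was submitted with nvme_completion_poll_cb as its callback and the same nvme_completion_poll_status as the callback argument. A minimal sketch of that pairing, modeled on the log-page fetch done later in this patch and assuming a constructed ctrlr with a working admin queue:

	struct nvme_completion_poll_status status;
	struct spdk_nvme_health_information_page *health;
	uint64_t phys_addr = 0;
	int rc = -ENOMEM;

	/* DMA-capable buffer for the log page payload */
	health = spdk_zmalloc(sizeof(*health), 64, &phys_addr,
			      SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
	if (health != NULL) {
		/* Submit with nvme_completion_poll_cb/&status, then spin until done */
		rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_HEALTH_INFORMATION,
						      SPDK_NVME_GLOBAL_NS_TAG, health, sizeof(*health),
						      0, nvme_completion_poll_cb, &status);
		if (rc == 0 && spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
			rc = -ENXIO; /* the completion carried an error status */
		}
		spdk_free(health);
	}
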
+
+static void
+nvme_user_copy_cmd_complete(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *req = arg;
+ enum spdk_nvme_data_transfer xfer;
+
+ if (req->user_buffer && req->payload_size) {
+ /* Copy back to the user buffer and free the contig buffer */
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+ xfer = spdk_nvme_opc_get_data_transfer(req->cmd.opc);
+ if (xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST ||
+ xfer == SPDK_NVME_DATA_BIDIRECTIONAL) {
+ assert(req->pid == getpid());
+ memcpy(req->user_buffer, req->payload.contig_or_cb_arg, req->payload_size);
+ }
+
+ spdk_dma_free(req->payload.contig_or_cb_arg);
+ }
+
+ /* Call the user's original callback now that the buffer has been copied */
+ req->user_cb_fn(req->user_cb_arg, cpl);
+}
+
+/**
+ * Allocate a request as well as a DMA-capable buffer to copy to/from the user's buffer.
+ *
+ * This is intended for use in non-fast-path functions (admin commands, reservations, etc.)
+ * where the overhead of a copy is not a problem.
+ */
+struct nvme_request *
+nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, bool host_to_controller)
+{
+ struct nvme_request *req;
+ void *dma_buffer = NULL;
+ uint64_t phys_addr;
+
+ if (buffer && payload_size) {
+ dma_buffer = spdk_zmalloc(payload_size, 4096, &phys_addr,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (!dma_buffer) {
+ return NULL;
+ }
+
+ if (host_to_controller) {
+ memcpy(dma_buffer, buffer, payload_size);
+ }
+ }
+
+ req = nvme_allocate_request_contig(qpair, dma_buffer, payload_size, nvme_user_copy_cmd_complete,
+ NULL);
+ if (!req) {
+ spdk_free(dma_buffer);
+ return NULL;
+ }
+
+ req->user_cb_fn = cb_fn;
+ req->user_cb_arg = cb_arg;
+ req->user_buffer = buffer;
+ req->cb_arg = req;
+
+ return req;
+}
+
+/**
+ * Check if a request has exceeded the controller timeout.
+ *
+ * \param req request to check for timeout.
+ * \param cid command ID for command submitted by req (will be passed to timeout_cb_fn)
+ * \param active_proc per-process data for the controller associated with req
+ * \param now_tick current time from spdk_get_ticks()
+ * \return 0 if requests submitted more recently than req should still be checked for timeouts, or
+ * 1 if requests newer than req need not be checked.
+ *
+ * The request's timeout callback will be called if needed; the caller is only responsible for
+ * calling this function on each outstanding request.
+ */
+int
+nvme_request_check_timeout(struct nvme_request *req, uint16_t cid,
+ struct spdk_nvme_ctrlr_process *active_proc,
+ uint64_t now_tick)
+{
+ struct spdk_nvme_qpair *qpair = req->qpair;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ assert(active_proc->timeout_cb_fn != NULL);
+
+ if (req->timed_out || req->submit_tick == 0) {
+ return 0;
+ }
+
+ if (req->pid != g_spdk_nvme_pid) {
+ return 0;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair) &&
+ req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ return 0;
+ }
+
+ if (req->submit_tick + active_proc->timeout_ticks > now_tick) {
+ return 1;
+ }
+
+ req->timed_out = true;
+
+ /*
+ * We don't want to expose the admin queue to the user,
+ * so when we're timing out admin commands set the
+ * qpair to NULL.
+ */
+ active_proc->timeout_cb_fn(active_proc->timeout_cb_arg, ctrlr,
+ nvme_qpair_is_admin_queue(qpair) ? NULL : qpair,
+ cid);
+ return 0;
+}
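
A hedged sketch of the caller side of this contract (the real loops live in the transport code and walk their own tracker structures; the names below are purely illustrative): outstanding commands are visited oldest-first and the scan stops at the first return value of 1, since anything submitted later cannot have timed out yet.

	uint64_t now = spdk_get_ticks();
	struct example_tracker *tr;                         /* hypothetical per-command tracker */

	TAILQ_FOREACH(tr, &outstanding_trackers, list) {    /* hypothetical oldest-first list */
		if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, now)) {
			break;  /* newer requests need not be checked */
		}
	}
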
+
+int
+nvme_robust_mutex_init_shared(pthread_mutex_t *mtx)
+{
+ int rc = 0;
+
+#ifdef __FreeBSD__
+ pthread_mutex_init(mtx, NULL);
+#else
+ pthread_mutexattr_t attr;
+
+ if (pthread_mutexattr_init(&attr)) {
+ return -1;
+ }
+ if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) ||
+ pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) ||
+ pthread_mutex_init(mtx, &attr)) {
+ rc = -1;
+ }
+ pthread_mutexattr_destroy(&attr);
+#endif
+
+ return rc;
+}
+
+int
+nvme_driver_init(void)
+{
+ int ret = 0;
+ /* Any socket ID */
+ int socket_id = -1;
+
+ /* Each process needs its own pid. */
+ g_spdk_nvme_pid = getpid();
+
+ /*
+ * Only one thread from one process will do this driver init work.
+ * The primary process will reserve the shared memory and do the
+ * initialization.
+ * The secondary process will lookup the existing reserved memory.
+ */
+ if (spdk_process_is_primary()) {
+		/* The uniquely named memzone has already been reserved. */
+ if (g_spdk_nvme_driver != NULL) {
+ return 0;
+ } else {
+ g_spdk_nvme_driver = spdk_memzone_reserve(SPDK_NVME_DRIVER_NAME,
+ sizeof(struct nvme_driver), socket_id,
+ SPDK_MEMZONE_NO_IOVA_CONTIG);
+ }
+
+ if (g_spdk_nvme_driver == NULL) {
+ SPDK_ERRLOG("primary process failed to reserve memory\n");
+
+ return -1;
+ }
+ } else {
+ g_spdk_nvme_driver = spdk_memzone_lookup(SPDK_NVME_DRIVER_NAME);
+
+		/* The uniquely named memzone was already reserved by the primary process. */
+ if (g_spdk_nvme_driver != NULL) {
+ int ms_waited = 0;
+
+			/* Wait for the nvme driver to finish initializing. */
+ while ((g_spdk_nvme_driver->initialized == false) &&
+ (ms_waited < g_nvme_driver_timeout_ms)) {
+ ms_waited++;
+ nvme_delay(1000); /* delay 1ms */
+ }
+ if (g_spdk_nvme_driver->initialized == false) {
+ SPDK_ERRLOG("timeout waiting for primary process to init\n");
+
+ return -1;
+ }
+ } else {
+ SPDK_ERRLOG("primary process is not started yet\n");
+
+ return -1;
+ }
+
+ return 0;
+ }
+
+ /*
+ * At this moment, only one thread from the primary process will do
+ * the g_spdk_nvme_driver initialization
+ */
+ assert(spdk_process_is_primary());
+
+ ret = nvme_robust_mutex_init_shared(&g_spdk_nvme_driver->lock);
+ if (ret != 0) {
+ SPDK_ERRLOG("failed to initialize mutex\n");
+ spdk_memzone_free(SPDK_NVME_DRIVER_NAME);
+ return ret;
+ }
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ g_spdk_nvme_driver->initialized = false;
+
+ TAILQ_INIT(&g_spdk_nvme_driver->shared_attached_ctrlrs);
+
+ spdk_uuid_generate(&g_spdk_nvme_driver->default_extended_host_id);
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ return ret;
+}
+
+int
+nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, void *devhandle,
+ spdk_nvme_probe_cb probe_cb, void *cb_ctx)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_nvme_ctrlr_opts opts;
+
+ assert(trid != NULL);
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
+
+ if (!probe_cb || probe_cb(cb_ctx, trid, &opts)) {
+ ctrlr = nvme_transport_ctrlr_construct(trid, &opts, devhandle);
+ if (ctrlr == NULL) {
+ SPDK_ERRLOG("Failed to construct NVMe controller for SSD: %s\n", trid->traddr);
+ return -1;
+ }
+
+ TAILQ_INSERT_TAIL(&g_nvme_init_ctrlrs, ctrlr, tailq);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int
+nvme_init_controllers(void *cb_ctx, spdk_nvme_attach_cb attach_cb)
+{
+ int rc = 0;
+ int start_rc;
+ struct spdk_nvme_ctrlr *ctrlr, *ctrlr_tmp;
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ /* Initialize all new controllers in the g_nvme_init_ctrlrs list in parallel. */
+ while (!TAILQ_EMPTY(&g_nvme_init_ctrlrs)) {
+ TAILQ_FOREACH_SAFE(ctrlr, &g_nvme_init_ctrlrs, tailq, ctrlr_tmp) {
+ /* Drop the driver lock while calling nvme_ctrlr_process_init()
+ * since it needs to acquire the driver lock internally when initializing
+			 * the controller.
+ *
+ * TODO: Rethink the locking - maybe reset should take the lock so that start() and
+ * the functions it calls (in particular nvme_ctrlr_set_num_qpairs())
+ * can assume it is held.
+ */
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ start_rc = nvme_ctrlr_process_init(ctrlr);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ if (start_rc) {
+ /* Controller failed to initialize. */
+ TAILQ_REMOVE(&g_nvme_init_ctrlrs, ctrlr, tailq);
+ SPDK_ERRLOG("Failed to initialize SSD: %s\n", ctrlr->trid.traddr);
+ nvme_ctrlr_destruct(ctrlr);
+ rc = -1;
+ break;
+ }
+
+ if (ctrlr->state == NVME_CTRLR_STATE_READY) {
+ /*
+ * Controller has been initialized.
+ * Move it to the attached_ctrlrs list.
+ */
+ TAILQ_REMOVE(&g_nvme_init_ctrlrs, ctrlr, tailq);
+ if (nvme_ctrlr_shared(ctrlr)) {
+ TAILQ_INSERT_TAIL(&g_spdk_nvme_driver->shared_attached_ctrlrs, ctrlr, tailq);
+ } else {
+ TAILQ_INSERT_TAIL(&g_nvme_attached_ctrlrs, ctrlr, tailq);
+ }
+
+ /*
+ * Increase the ref count before calling attach_cb() as the user may
+ * call nvme_detach() immediately.
+ */
+ nvme_ctrlr_proc_get_ref(ctrlr);
+
+ /*
+ * Unlock while calling attach_cb() so the user can call other functions
+ * that may take the driver lock, like nvme_detach().
+ */
+ if (attach_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ attach_cb(cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+
+ break;
+ }
+ }
+ }
+
+ g_spdk_nvme_driver->initialized = true;
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ return rc;
+}
+
+/* This function must not be called while holding g_spdk_nvme_driver->lock */
+static struct spdk_nvme_ctrlr *
+spdk_nvme_get_ctrlr_by_trid(const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(trid);
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ return ctrlr;
+}
+
+/* This function must be called while holding g_spdk_nvme_driver->lock */
+struct spdk_nvme_ctrlr *
+spdk_nvme_get_ctrlr_by_trid_unsafe(const struct spdk_nvme_transport_id *trid)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ /* Search per-process list */
+ TAILQ_FOREACH(ctrlr, &g_nvme_attached_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) {
+ return ctrlr;
+ }
+ }
+
+ /* Search multi-process shared list */
+ TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) {
+ if (spdk_nvme_transport_id_compare(&ctrlr->trid, trid) == 0) {
+ return ctrlr;
+ }
+ }
+
+ return NULL;
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+static int
+spdk_nvme_probe_internal(const struct spdk_nvme_transport_id *trid, void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb,
+ spdk_nvme_remove_cb remove_cb, struct spdk_nvme_ctrlr **connected_ctrlr)
+{
+ int rc;
+ struct spdk_nvme_ctrlr *ctrlr;
+ bool direct_connect = (connected_ctrlr != NULL);
+
+ if (!spdk_nvme_transport_available(trid->trtype)) {
+ SPDK_ERRLOG("NVMe trtype %u not available\n", trid->trtype);
+ return -1;
+ }
+
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+
+ nvme_transport_ctrlr_scan(trid, cb_ctx, probe_cb, remove_cb, direct_connect);
+
+ /*
+ * Probe controllers on the shared_attached_ctrlrs list
+ */
+ if (!spdk_process_is_primary() && (trid->trtype == SPDK_NVME_TRANSPORT_PCIE)) {
+ TAILQ_FOREACH(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq) {
+			/* Do not attach other ctrlrs if the user specified a valid trid */
+ if ((strlen(trid->traddr) != 0) &&
+ (spdk_nvme_transport_id_compare(trid, &ctrlr->trid))) {
+ continue;
+ }
+
+ nvme_ctrlr_proc_get_ref(ctrlr);
+
+ /*
+ * Unlock while calling attach_cb() so the user can call other functions
+ * that may take the driver lock, like nvme_detach().
+ */
+ if (attach_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ attach_cb(cb_ctx, &ctrlr->trid, ctrlr, &ctrlr->opts);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+
+ rc = 0;
+
+ goto exit;
+ }
+
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ /*
+ * Keep going even if one or more nvme_attach() calls failed,
+ * but maintain the value of rc to signal errors when we return.
+ */
+
+ rc = nvme_init_controllers(cb_ctx, attach_cb);
+
+exit:
+ if (connected_ctrlr) {
+ *connected_ctrlr = spdk_nvme_get_ctrlr_by_trid(trid);
+ }
+
+ return rc;
+}
+
+int
+spdk_nvme_probe(const struct spdk_nvme_transport_id *trid, void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb, spdk_nvme_attach_cb attach_cb,
+ spdk_nvme_remove_cb remove_cb)
+{
+ int rc;
+ struct spdk_nvme_transport_id trid_pcie;
+
+ rc = nvme_driver_init();
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (trid == NULL) {
+ memset(&trid_pcie, 0, sizeof(trid_pcie));
+ trid_pcie.trtype = SPDK_NVME_TRANSPORT_PCIE;
+ trid = &trid_pcie;
+ }
+
+ return spdk_nvme_probe_internal(trid, cb_ctx, probe_cb, attach_cb, remove_cb, NULL);
+}
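
For context, a sketch of how an application typically drives this entry point (the callback bodies and names here are illustrative): passing a NULL transport ID, as handled above, scans the local PCIe bus.

	static bool
	example_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
			 struct spdk_nvme_ctrlr_opts *opts)
	{
		return true;    /* attach every controller that is found */
	}

	static void
	example_attach_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
			  struct spdk_nvme_ctrlr *ctrlr, const struct spdk_nvme_ctrlr_opts *opts)
	{
		printf("attached NVMe controller at %s\n", trid->traddr);
	}

	/* NULL trid => enumerate the PCIe transport */
	if (spdk_nvme_probe(NULL, NULL, example_probe_cb, example_attach_cb, NULL) != 0) {
		fprintf(stderr, "spdk_nvme_probe() failed\n");
	}
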
+
+static bool
+spdk_nvme_connect_probe_cb(void *cb_ctx, const struct spdk_nvme_transport_id *trid,
+ struct spdk_nvme_ctrlr_opts *opts)
+{
+ struct spdk_nvme_ctrlr_connect_opts *requested_opts = cb_ctx;
+
+ assert(requested_opts->opts);
+
+ assert(requested_opts->opts_size != 0);
+
+ memcpy(opts, requested_opts->opts, spdk_min(sizeof(*opts), requested_opts->opts_size));
+
+ return true;
+}
+
+struct spdk_nvme_ctrlr *
+spdk_nvme_connect(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts, size_t opts_size)
+{
+ int rc;
+ struct spdk_nvme_ctrlr_connect_opts connect_opts = {};
+ struct spdk_nvme_ctrlr_connect_opts *user_connect_opts = NULL;
+ struct spdk_nvme_ctrlr *ctrlr = NULL;
+ spdk_nvme_probe_cb probe_cb = NULL;
+
+ if (trid == NULL) {
+ SPDK_ERRLOG("No transport ID specified\n");
+ return NULL;
+ }
+
+ rc = nvme_driver_init();
+ if (rc != 0) {
+ return NULL;
+ }
+
+ if (opts && opts_size > 0) {
+ connect_opts.opts = opts;
+ connect_opts.opts_size = opts_size;
+ user_connect_opts = &connect_opts;
+ probe_cb = spdk_nvme_connect_probe_cb;
+ }
+
+ spdk_nvme_probe_internal(trid, user_connect_opts, probe_cb, NULL, NULL, &ctrlr);
+
+ return ctrlr;
+}
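
A short application-side sketch of the direct-connect path (the PCI address and the queue-count override are placeholder values):

	struct spdk_nvme_transport_id trid = {};
	struct spdk_nvme_ctrlr_opts opts;
	struct spdk_nvme_ctrlr *ctrlr;

	/* spdk_nvme_transport_id_parse() is defined later in this file */
	spdk_nvme_transport_id_parse(&trid, "trtype:PCIe traddr:0000:01:00.0");

	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, sizeof(opts));
	opts.num_io_queues = 4;         /* override one default */

	ctrlr = spdk_nvme_connect(&trid, &opts, sizeof(opts));
	if (ctrlr == NULL) {
		fprintf(stderr, "spdk_nvme_connect() failed\n");
	}
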
+
+int
+spdk_nvme_transport_id_parse_trtype(enum spdk_nvme_transport_type *trtype, const char *str)
+{
+ if (trtype == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ if (strcasecmp(str, "PCIe") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_PCIE;
+ } else if (strcasecmp(str, "RDMA") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_RDMA;
+ } else if (strcasecmp(str, "FC") == 0) {
+ *trtype = SPDK_NVME_TRANSPORT_FC;
+ } else {
+ return -ENOENT;
+ }
+ return 0;
+}
+
+const char *
+spdk_nvme_transport_id_trtype_str(enum spdk_nvme_transport_type trtype)
+{
+ switch (trtype) {
+ case SPDK_NVME_TRANSPORT_PCIE:
+ return "PCIe";
+ case SPDK_NVME_TRANSPORT_RDMA:
+ return "RDMA";
+ case SPDK_NVME_TRANSPORT_FC:
+ return "FC";
+ default:
+ return NULL;
+ }
+}
+
+int
+spdk_nvme_transport_id_parse_adrfam(enum spdk_nvmf_adrfam *adrfam, const char *str)
+{
+ if (adrfam == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ if (strcasecmp(str, "IPv4") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IPV4;
+ } else if (strcasecmp(str, "IPv6") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IPV6;
+ } else if (strcasecmp(str, "IB") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_IB;
+ } else if (strcasecmp(str, "FC") == 0) {
+ *adrfam = SPDK_NVMF_ADRFAM_FC;
+ } else {
+ return -ENOENT;
+ }
+ return 0;
+}
+
+const char *
+spdk_nvme_transport_id_adrfam_str(enum spdk_nvmf_adrfam adrfam)
+{
+ switch (adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ return "IPv4";
+ case SPDK_NVMF_ADRFAM_IPV6:
+ return "IPv6";
+ case SPDK_NVMF_ADRFAM_IB:
+ return "IB";
+ case SPDK_NVMF_ADRFAM_FC:
+ return "FC";
+ default:
+ return NULL;
+ }
+}
+
+int
+spdk_nvme_transport_id_parse(struct spdk_nvme_transport_id *trid, const char *str)
+{
+ const char *sep, *sep1;
+ const char *whitespace = " \t\n";
+ size_t key_len, val_len;
+ char key[32];
+ char val[1024];
+
+ if (trid == NULL || str == NULL) {
+ return -EINVAL;
+ }
+
+ while (*str != '\0') {
+ str += strspn(str, whitespace);
+
+ sep = strchr(str, ':');
+ if (!sep) {
+ sep = strchr(str, '=');
+ if (!sep) {
+ SPDK_ERRLOG("Key without ':' or '=' separator\n");
+ return -EINVAL;
+ }
+ } else {
+ sep1 = strchr(str, '=');
+ if ((sep1 != NULL) && (sep1 < sep)) {
+ sep = sep1;
+ }
+ }
+
+ key_len = sep - str;
+ if (key_len >= sizeof(key)) {
+ SPDK_ERRLOG("Transport key length %zu greater than maximum allowed %zu\n",
+ key_len, sizeof(key) - 1);
+ return -EINVAL;
+ }
+
+ memcpy(key, str, key_len);
+ key[key_len] = '\0';
+
+ str += key_len + 1; /* Skip key: */
+ val_len = strcspn(str, whitespace);
+ if (val_len == 0) {
+ SPDK_ERRLOG("Key without value\n");
+ return -EINVAL;
+ }
+
+ if (val_len >= sizeof(val)) {
+ SPDK_ERRLOG("Transport value length %zu greater than maximum allowed %zu\n",
+ val_len, sizeof(val) - 1);
+ return -EINVAL;
+ }
+
+ memcpy(val, str, val_len);
+ val[val_len] = '\0';
+
+ str += val_len;
+
+ if (strcasecmp(key, "trtype") == 0) {
+ if (spdk_nvme_transport_id_parse_trtype(&trid->trtype, val) != 0) {
+ SPDK_ERRLOG("Unknown trtype '%s'\n", val);
+ return -EINVAL;
+ }
+ } else if (strcasecmp(key, "adrfam") == 0) {
+ if (spdk_nvme_transport_id_parse_adrfam(&trid->adrfam, val) != 0) {
+ SPDK_ERRLOG("Unknown adrfam '%s'\n", val);
+ return -EINVAL;
+ }
+ } else if (strcasecmp(key, "traddr") == 0) {
+ if (val_len > SPDK_NVMF_TRADDR_MAX_LEN) {
+ SPDK_ERRLOG("traddr length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRADDR_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->traddr, val, val_len + 1);
+ } else if (strcasecmp(key, "trsvcid") == 0) {
+ if (val_len > SPDK_NVMF_TRSVCID_MAX_LEN) {
+ SPDK_ERRLOG("trsvcid length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_TRSVCID_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->trsvcid, val, val_len + 1);
+ } else if (strcasecmp(key, "subnqn") == 0) {
+ if (val_len > SPDK_NVMF_NQN_MAX_LEN) {
+ SPDK_ERRLOG("subnqn length %zu greater than maximum allowed %u\n",
+ val_len, SPDK_NVMF_NQN_MAX_LEN);
+ return -EINVAL;
+ }
+ memcpy(trid->subnqn, val, val_len + 1);
+ } else {
+ SPDK_ERRLOG("Unknown transport ID key '%s'\n", key);
+ }
+ }
+
+ return 0;
+}
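
The parser accepts whitespace-separated key:value (or key=value) pairs in any order; a sketch with placeholder fabrics values:

	struct spdk_nvme_transport_id trid = {};

	if (spdk_nvme_transport_id_parse(&trid,
			"trtype:RDMA adrfam:IPv4 traddr:192.168.1.10 trsvcid:4420 "
			"subnqn:nqn.2016-06.io.spdk:cnode1") != 0) {
		fprintf(stderr, "failed to parse transport ID\n");
	}
	/* On success: trid.trtype == SPDK_NVME_TRANSPORT_RDMA,
	 * trid.adrfam == SPDK_NVMF_ADRFAM_IPV4, and the string fields are copied. */
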
+
+static int
+cmp_int(int a, int b)
+{
+ return a - b;
+}
+
+int
+spdk_nvme_transport_id_compare(const struct spdk_nvme_transport_id *trid1,
+ const struct spdk_nvme_transport_id *trid2)
+{
+ int cmp;
+
+ cmp = cmp_int(trid1->trtype, trid2->trtype);
+ if (cmp) {
+ return cmp;
+ }
+
+ if (trid1->trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ struct spdk_pci_addr pci_addr1;
+ struct spdk_pci_addr pci_addr2;
+
+ /* Normalize PCI addresses before comparing */
+ if (spdk_pci_addr_parse(&pci_addr1, trid1->traddr) < 0 ||
+ spdk_pci_addr_parse(&pci_addr2, trid2->traddr) < 0) {
+ return -1;
+ }
+
+ /* PCIe transport ID only uses trtype and traddr */
+ return spdk_pci_addr_compare(&pci_addr1, &pci_addr2);
+ }
+
+ cmp = strcasecmp(trid1->traddr, trid2->traddr);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = cmp_int(trid1->adrfam, trid2->adrfam);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = strcasecmp(trid1->trsvcid, trid2->trsvcid);
+ if (cmp) {
+ return cmp;
+ }
+
+ cmp = strcmp(trid1->subnqn, trid2->subnqn);
+ if (cmp) {
+ return cmp;
+ }
+
+ return 0;
+}
+
+SPDK_LOG_REGISTER_COMPONENT("nvme", SPDK_LOG_NVME)
diff --git a/src/spdk/lib/nvme/nvme_ctrlr.c b/src/spdk/lib/nvme/nvme_ctrlr.c
new file mode 100644
index 00000000..69ae0878
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr.c
@@ -0,0 +1,2678 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+
+#include "nvme_internal.h"
+
+#include "spdk/env.h"
+#include "spdk/string.h"
+
+static int nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_async_event_request *aer);
+static int nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns);
+static int nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns);
+
+static int
+nvme_ctrlr_get_cc(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cc_register *cc)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ &cc->raw);
+}
+
+static int
+nvme_ctrlr_get_csts(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_csts_register *csts)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, csts.raw),
+ &csts->raw);
+}
+
+int
+nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap)
+{
+ return nvme_transport_ctrlr_get_reg_8(ctrlr, offsetof(struct spdk_nvme_registers, cap.raw),
+ &cap->raw);
+}
+
+int
+nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs)
+{
+ return nvme_transport_ctrlr_get_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, vs.raw),
+ &vs->raw);
+}
+
+static int
+nvme_ctrlr_set_cc(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cc_register *cc)
+{
+ return nvme_transport_ctrlr_set_reg_4(ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ cc->raw);
+}
+
+void
+spdk_nvme_ctrlr_get_default_ctrlr_opts(struct spdk_nvme_ctrlr_opts *opts, size_t opts_size)
+{
+ char host_id_str[SPDK_UUID_STRING_LEN];
+
+ assert(opts);
+
+ memset(opts, 0, opts_size);
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_nvme_ctrlr_opts, field) + sizeof(opts->field) <= opts_size
+
+ if (FIELD_OK(num_io_queues)) {
+ opts->num_io_queues = DEFAULT_MAX_IO_QUEUES;
+ }
+
+ if (FIELD_OK(use_cmb_sqs)) {
+ opts->use_cmb_sqs = true;
+ }
+
+ if (FIELD_OK(arb_mechanism)) {
+ opts->arb_mechanism = SPDK_NVME_CC_AMS_RR;
+ }
+
+ if (FIELD_OK(keep_alive_timeout_ms)) {
+ opts->keep_alive_timeout_ms = 10 * 1000;
+ }
+
+ if (FIELD_OK(io_queue_size)) {
+ opts->io_queue_size = DEFAULT_IO_QUEUE_SIZE;
+ }
+
+ if (FIELD_OK(io_queue_requests)) {
+ opts->io_queue_requests = DEFAULT_IO_QUEUE_REQUESTS;
+ }
+
+ if (FIELD_OK(host_id)) {
+ memset(opts->host_id, 0, sizeof(opts->host_id));
+ }
+
+ if (nvme_driver_init() == 0) {
+ if (FIELD_OK(extended_host_id)) {
+ memcpy(opts->extended_host_id, &g_spdk_nvme_driver->default_extended_host_id,
+ sizeof(opts->extended_host_id));
+ }
+
+ if (FIELD_OK(hostnqn)) {
+ spdk_uuid_fmt_lower(host_id_str, sizeof(host_id_str),
+ &g_spdk_nvme_driver->default_extended_host_id);
+ snprintf(opts->hostnqn, sizeof(opts->hostnqn), "2014-08.org.nvmexpress:uuid:%s", host_id_str);
+ }
+ }
+
+ if (FIELD_OK(src_addr)) {
+ memset(opts->src_addr, 0, sizeof(opts->src_addr));
+ }
+
+ if (FIELD_OK(src_svcid)) {
+ memset(opts->src_svcid, 0, sizeof(opts->src_svcid));
+ }
+
+ if (FIELD_OK(command_set)) {
+ opts->command_set = SPDK_NVME_CC_CSS_NVM;
+ }
+#undef FIELD_OK
+}
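
The FIELD_OK() gating above is what keeps this ABI-stable: only fields that fit entirely within the caller-supplied opts_size are written, so a binary built against an older, shorter version of the struct still gets valid defaults. A hedged illustration (the cut-off field is chosen arbitrarily):

	struct spdk_nvme_ctrlr_opts opts;
	/* Pretend the caller only knows the struct up to and including io_queue_size */
	size_t old_size = offsetof(struct spdk_nvme_ctrlr_opts, io_queue_size) +
			  sizeof(opts.io_queue_size);

	spdk_nvme_ctrlr_get_default_ctrlr_opts(&opts, old_size);
	/* Fields laid out beyond old_size are simply left untouched. */
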
+
+/**
+ * This function will be called when the process allocates the IO qpair.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_proc_add_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ TAILQ_INSERT_TAIL(&active_proc->allocated_io_qpairs, qpair, per_process_tailq);
+ qpair->active_proc = active_proc;
+ }
+}
+
+/**
+ * This function will be called when the process frees the IO qpair.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_proc_remove_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_qpair *active_qpair, *tmp_qpair;
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (!active_proc) {
+ return;
+ }
+
+ TAILQ_FOREACH_SAFE(active_qpair, &active_proc->allocated_io_qpairs,
+ per_process_tailq, tmp_qpair) {
+ if (active_qpair == qpair) {
+ TAILQ_REMOVE(&active_proc->allocated_io_qpairs,
+ active_qpair, per_process_tailq);
+
+ break;
+ }
+ }
+}
+
+void
+spdk_nvme_ctrlr_get_default_io_qpair_opts(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_io_qpair_opts *opts,
+ size_t opts_size)
+{
+ assert(ctrlr);
+
+ assert(opts);
+
+ memset(opts, 0, opts_size);
+
+#define FIELD_OK(field) \
+ offsetof(struct spdk_nvme_io_qpair_opts, field) + sizeof(opts->field) <= opts_size
+
+ if (FIELD_OK(qprio)) {
+ opts->qprio = SPDK_NVME_QPRIO_URGENT;
+ }
+
+ if (FIELD_OK(io_queue_size)) {
+ opts->io_queue_size = ctrlr->opts.io_queue_size;
+ }
+
+ if (FIELD_OK(io_queue_requests)) {
+ opts->io_queue_requests = ctrlr->opts.io_queue_requests;
+ }
+
+#undef FIELD_OK
+}
+
+struct spdk_nvme_qpair *
+spdk_nvme_ctrlr_alloc_io_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_io_qpair_opts *user_opts,
+ size_t opts_size)
+{
+ uint32_t qid;
+ struct spdk_nvme_qpair *qpair;
+ union spdk_nvme_cc_register cc;
+ struct spdk_nvme_io_qpair_opts opts;
+
+ if (!ctrlr) {
+ return NULL;
+ }
+
+ /*
+ * Get the default options, then overwrite them with the user-provided options
+ * up to opts_size.
+ *
+ * This allows for extensions of the opts structure without breaking
+ * ABI compatibility.
+ */
+ spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &opts, sizeof(opts));
+ if (user_opts) {
+ memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size));
+ }
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc failed\n");
+ return NULL;
+ }
+
+ /* Only the low 2 bits (values 0, 1, 2, 3) of QPRIO are valid. */
+ if ((opts.qprio & 3) != opts.qprio) {
+ return NULL;
+ }
+
+ /*
+ * Only value SPDK_NVME_QPRIO_URGENT(0) is valid for the
+ * default round robin arbitration method.
+ */
+ if ((cc.bits.ams == SPDK_NVME_CC_AMS_RR) && (opts.qprio != SPDK_NVME_QPRIO_URGENT)) {
+ SPDK_ERRLOG("invalid queue priority for default round robin arbitration method\n");
+ return NULL;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ /*
+ * Get the first available I/O queue ID.
+ */
+ qid = spdk_bit_array_find_first_set(ctrlr->free_io_qids, 1);
+ if (qid > ctrlr->opts.num_io_queues) {
+ SPDK_ERRLOG("No free I/O queue IDs\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+
+ qpair = nvme_transport_ctrlr_create_io_qpair(ctrlr, qid, &opts);
+ if (qpair == NULL) {
+ SPDK_ERRLOG("nvme_transport_ctrlr_create_io_qpair() failed\n");
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return NULL;
+ }
+ spdk_bit_array_clear(ctrlr->free_io_qids, qid);
+ TAILQ_INSERT_TAIL(&ctrlr->active_io_qpairs, qpair, tailq);
+
+ nvme_ctrlr_proc_add_io_qpair(qpair);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->quirks & NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC) {
+ spdk_delay_us(100);
+ }
+
+ return qpair;
+}
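
Application-side use of the qpair allocator above typically pairs it with the default-options helper; a short sketch (the reduced queue depth is an arbitrary example):

	struct spdk_nvme_io_qpair_opts qp_opts;
	struct spdk_nvme_qpair *qpair;

	spdk_nvme_ctrlr_get_default_io_qpair_opts(ctrlr, &qp_opts, sizeof(qp_opts));
	qp_opts.io_queue_size = 128;

	qpair = spdk_nvme_ctrlr_alloc_io_qpair(ctrlr, &qp_opts, sizeof(qp_opts));
	if (qpair == NULL) {
		fprintf(stderr, "failed to allocate I/O qpair\n");
	}
	/* ... submit I/O and poll completions on qpair ... */
	spdk_nvme_ctrlr_free_io_qpair(qpair);
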
+
+int
+spdk_nvme_ctrlr_free_io_qpair(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ if (qpair == NULL) {
+ return 0;
+ }
+
+ ctrlr = qpair->ctrlr;
+
+ if (qpair->in_completion_context) {
+ /*
+ * There are many cases where it is convenient to delete an io qpair in the context
+ * of that qpair's completion routine. To handle this properly, set a flag here
+ * so that the completion routine will perform an actual delete after the context
+ * unwinds.
+ */
+ qpair->delete_after_completion_context = 1;
+ return 0;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_proc_remove_io_qpair(qpair);
+
+ TAILQ_REMOVE(&ctrlr->active_io_qpairs, qpair, tailq);
+ spdk_bit_array_set(ctrlr->free_io_qids, qpair->id);
+
+ if (nvme_transport_ctrlr_delete_io_qpair(ctrlr, qpair)) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -1;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return 0;
+}
+
+static void
+nvme_ctrlr_construct_intel_support_log_page_list(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_intel_log_page_directory *log_page_directory)
+{
+ if (log_page_directory == NULL) {
+ return;
+ }
+
+ if (ctrlr->cdata.vid != SPDK_PCI_VID_INTEL) {
+ return;
+ }
+
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY] = true;
+
+ if (log_page_directory->read_latency_log_len ||
+ (ctrlr->quirks & NVME_INTEL_QUIRK_READ_LATENCY)) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_READ_CMD_LATENCY] = true;
+ }
+ if (log_page_directory->write_latency_log_len ||
+ (ctrlr->quirks & NVME_INTEL_QUIRK_WRITE_LATENCY)) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_WRITE_CMD_LATENCY] = true;
+ }
+ if (log_page_directory->temperature_statistics_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_TEMPERATURE] = true;
+ }
+ if (log_page_directory->smart_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_LOG_SMART] = true;
+ }
+ if (log_page_directory->marketing_description_log_len) {
+ ctrlr->log_page_supported[SPDK_NVME_INTEL_MARKETING_DESCRIPTION] = true;
+ }
+}
+
+static int nvme_ctrlr_set_intel_support_log_pages(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ uint64_t phys_addr = 0;
+ struct nvme_completion_poll_status status;
+ struct spdk_nvme_intel_log_page_directory *log_page_directory;
+
+ log_page_directory = spdk_zmalloc(sizeof(struct spdk_nvme_intel_log_page_directory),
+ 64, &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_DMA);
+ if (log_page_directory == NULL) {
+ SPDK_ERRLOG("could not allocate log_page_directory\n");
+ return -ENXIO;
+ }
+
+ rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_INTEL_LOG_PAGE_DIRECTORY,
+ SPDK_NVME_GLOBAL_NS_TAG, log_page_directory,
+ sizeof(struct spdk_nvme_intel_log_page_directory),
+ 0, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ spdk_free(log_page_directory);
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ spdk_free(log_page_directory);
+ SPDK_ERRLOG("nvme_ctrlr_cmd_get_log_page failed!\n");
+ return -ENXIO;
+ }
+
+ nvme_ctrlr_construct_intel_support_log_page_list(ctrlr, log_page_directory);
+ spdk_free(log_page_directory);
+ return 0;
+}
+
+static int
+nvme_ctrlr_set_supported_log_pages(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+
+ memset(ctrlr->log_page_supported, 0, sizeof(ctrlr->log_page_supported));
+ /* Mandatory pages */
+ ctrlr->log_page_supported[SPDK_NVME_LOG_ERROR] = true;
+ ctrlr->log_page_supported[SPDK_NVME_LOG_HEALTH_INFORMATION] = true;
+ ctrlr->log_page_supported[SPDK_NVME_LOG_FIRMWARE_SLOT] = true;
+ if (ctrlr->cdata.lpa.celp) {
+ ctrlr->log_page_supported[SPDK_NVME_LOG_COMMAND_EFFECTS_LOG] = true;
+ }
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL && !(ctrlr->quirks & NVME_INTEL_QUIRK_NO_LOG_PAGES)) {
+ rc = nvme_ctrlr_set_intel_support_log_pages(ctrlr);
+ }
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_set_intel_supported_features(struct spdk_nvme_ctrlr *ctrlr)
+{
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_MAX_LBA] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_NATIVE_MAX_LBA] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_POWER_GOVERNOR_SETTING] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_SMBUS_ADDRESS] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LED_PATTERN] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_RESET_TIMED_WORKLOAD_COUNTERS] = true;
+ ctrlr->feature_supported[SPDK_NVME_INTEL_FEAT_LATENCY_TRACKING] = true;
+}
+
+static void
+nvme_ctrlr_set_supported_features(struct spdk_nvme_ctrlr *ctrlr)
+{
+ memset(ctrlr->feature_supported, 0, sizeof(ctrlr->feature_supported));
+ /* Mandatory features */
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ARBITRATION] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_POWER_MANAGEMENT] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ERROR_RECOVERY] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_NUMBER_OF_QUEUES] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_COALESCING] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_WRITE_ATOMICITY] = true;
+ ctrlr->feature_supported[SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION] = true;
+ /* Optional features */
+ if (ctrlr->cdata.vwc.present) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE] = true;
+ }
+ if (ctrlr->cdata.apsta.supported) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION] = true;
+ }
+ if (ctrlr->cdata.hmpre) {
+ ctrlr->feature_supported[SPDK_NVME_FEAT_HOST_MEM_BUFFER] = true;
+ }
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_INTEL) {
+ nvme_ctrlr_set_intel_supported_features(ctrlr);
+ }
+}
+
+void
+nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove)
+{
+ /*
+	 * Set the flag here and leave the actual failing of the qpairs to
+ * spdk_nvme_qpair_process_completions().
+ */
+ if (hot_remove) {
+ ctrlr->is_removed = true;
+ }
+ ctrlr->is_failed = true;
+ SPDK_ERRLOG("ctrlr %s in failed state.\n", ctrlr->trid.traddr);
+}
+
+static void
+nvme_ctrlr_shutdown(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ uint32_t ms_waited = 0;
+ uint32_t shutdown_timeout_ms;
+
+ if (ctrlr->is_removed) {
+ return;
+ }
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc() failed\n");
+ return;
+ }
+
+ cc.bits.shn = SPDK_NVME_SHN_NORMAL;
+
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return;
+ }
+
+ /*
+	 * The NVMe specification defines RTD3E to be the time from
+	 * setting SHN = 1 until the controller sets SHST = 10b.
+ * If the device doesn't report RTD3 entry latency, or if it
+ * reports RTD3 entry latency less than 10 seconds, pick
+ * 10 seconds as a reasonable amount of time to
+ * wait before proceeding.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RTD3E = %" PRIu32 " us\n", ctrlr->cdata.rtd3e);
+ shutdown_timeout_ms = (ctrlr->cdata.rtd3e + 999) / 1000;
+ shutdown_timeout_ms = spdk_max(shutdown_timeout_ms, 10000);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown timeout = %" PRIu32 " ms\n", shutdown_timeout_ms);
+
+ do {
+ if (nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ SPDK_ERRLOG("get_csts() failed\n");
+ return;
+ }
+
+ if (csts.bits.shst == SPDK_NVME_SHST_COMPLETE) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "shutdown complete in %u milliseconds\n",
+ ms_waited);
+ return;
+ }
+
+ nvme_delay(1000);
+ ms_waited++;
+ } while (ms_waited < shutdown_timeout_ms);
+
+ SPDK_ERRLOG("did not shutdown within %u milliseconds\n", shutdown_timeout_ms);
+}
+
+static int
+nvme_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ int rc;
+
+ rc = nvme_transport_ctrlr_enable(ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("transport ctrlr_enable failed\n");
+ return rc;
+ }
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("get_cc() failed\n");
+ return -EIO;
+ }
+
+ if (cc.bits.en != 0) {
+ SPDK_ERRLOG("%s called with CC.EN = 1\n", __func__);
+ return -EINVAL;
+ }
+
+ cc.bits.en = 1;
+ cc.bits.css = 0;
+ cc.bits.shn = 0;
+ cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
+ cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
+
+ /* Page size is 2 ^ (12 + mps). */
+ cc.bits.mps = spdk_u32log2(ctrlr->page_size) - 12;
+
+ if (ctrlr->cap.bits.css == 0) {
+ SPDK_INFOLOG(SPDK_LOG_NVME,
+ "Drive reports no command sets supported. Assuming NVM is supported.\n");
+ ctrlr->cap.bits.css = SPDK_NVME_CAP_CSS_NVM;
+ }
+
+ if (!(ctrlr->cap.bits.css & (1u << ctrlr->opts.command_set))) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested I/O command set %u but supported mask is 0x%x\n",
+ ctrlr->opts.command_set, ctrlr->cap.bits.css);
+ return -EINVAL;
+ }
+
+ cc.bits.css = ctrlr->opts.command_set;
+
+ switch (ctrlr->opts.arb_mechanism) {
+ case SPDK_NVME_CC_AMS_RR:
+ break;
+ case SPDK_NVME_CC_AMS_WRR:
+ if (SPDK_NVME_CAP_AMS_WRR & ctrlr->cap.bits.ams) {
+ break;
+ }
+ return -EINVAL;
+ case SPDK_NVME_CC_AMS_VS:
+ if (SPDK_NVME_CAP_AMS_VS & ctrlr->cap.bits.ams) {
+ break;
+ }
+ return -EINVAL;
+ default:
+ return -EINVAL;
+ }
+
+ cc.bits.ams = ctrlr->opts.arb_mechanism;
+
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ return -EIO;
+ }
+
+ return 0;
+}
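
As a concrete check of the encodings above: IOSQES = 6 and IOCQES = 4 encode the fixed 64-byte (2^6) submission and 16-byte (2^4) completion entry sizes, and for the common 4 KiB controller page size MPS works out to log2(4096) - 12 = 0.
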
+
+#ifdef DEBUG
+static const char *
+nvme_ctrlr_state_string(enum nvme_ctrlr_state state)
+{
+ switch (state) {
+ case NVME_CTRLR_STATE_INIT_DELAY:
+ return "delay init";
+ case NVME_CTRLR_STATE_INIT:
+ return "init";
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
+ return "disable and wait for CSTS.RDY = 1";
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
+ return "disable and wait for CSTS.RDY = 0";
+ case NVME_CTRLR_STATE_ENABLE:
+ return "enable controller by writing CC.EN = 1";
+ case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
+ return "wait for CSTS.RDY = 1";
+ case NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE:
+ return "enable admin queue";
+ case NVME_CTRLR_STATE_IDENTIFY:
+ return "identify controller";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY:
+ return "wait for identify controller";
+ case NVME_CTRLR_STATE_SET_NUM_QUEUES:
+ return "set number of queues";
+ case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES:
+ return "wait for set number of queues";
+ case NVME_CTRLR_STATE_GET_NUM_QUEUES:
+ return "get number of queues";
+ case NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES:
+ return "wait for get number of queues";
+ case NVME_CTRLR_STATE_CONSTRUCT_NS:
+ return "construct namespaces";
+ case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS:
+ return "identify active ns";
+ case NVME_CTRLR_STATE_IDENTIFY_NS:
+ return "identify ns";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS:
+ return "wait for identify ns";
+ case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS:
+ return "identify namespace id descriptors";
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS:
+ return "wait for identify namespace id descriptors";
+ case NVME_CTRLR_STATE_CONFIGURE_AER:
+ return "configure AER";
+ case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER:
+ return "wait for configure aer";
+ case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES:
+ return "set supported log pages";
+ case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES:
+ return "set supported features";
+ case NVME_CTRLR_STATE_SET_DB_BUF_CFG:
+ return "set doorbell buffer config";
+ case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG:
+ return "wait for doorbell buffer config";
+ case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT:
+ return "set keep alive timeout";
+ case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT:
+ return "wait for set keep alive timeout";
+ case NVME_CTRLR_STATE_SET_HOST_ID:
+ return "set host ID";
+ case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID:
+ return "wait for set host ID";
+ case NVME_CTRLR_STATE_READY:
+ return "ready";
+ case NVME_CTRLR_STATE_ERROR:
+ return "error";
+ }
+ return "unknown";
+};
+#endif /* DEBUG */
+
+static void
+nvme_ctrlr_set_state(struct spdk_nvme_ctrlr *ctrlr, enum nvme_ctrlr_state state,
+ uint64_t timeout_in_ms)
+{
+ ctrlr->state = state;
+ if (timeout_in_ms == NVME_TIMEOUT_INFINITE) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (no timeout)\n",
+ nvme_ctrlr_state_string(ctrlr->state));
+ ctrlr->state_timeout_tsc = NVME_TIMEOUT_INFINITE;
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "setting state to %s (timeout %" PRIu64 " ms)\n",
+ nvme_ctrlr_state_string(ctrlr->state), timeout_in_ms);
+ ctrlr->state_timeout_tsc = spdk_get_ticks() + (timeout_in_ms * spdk_get_ticks_hz()) / 1000;
+ }
+}
+
+static void
+nvme_ctrlr_free_doorbell_buffer(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->shadow_doorbell) {
+ spdk_dma_free(ctrlr->shadow_doorbell);
+ ctrlr->shadow_doorbell = NULL;
+ }
+
+ if (ctrlr->eventidx) {
+ spdk_dma_free(ctrlr->eventidx);
+ ctrlr->eventidx = NULL;
+ }
+}
+
+static void
+nvme_ctrlr_set_doorbell_buffer_config_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_WARNLOG("Doorbell buffer config failed\n");
+ } else {
+ SPDK_INFOLOG(SPDK_LOG_NVME, "NVMe controller: %s doorbell buffer config enabled\n",
+ ctrlr->trid.traddr);
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_set_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ uint64_t prp1, prp2;
+
+ if (!ctrlr->cdata.oacs.doorbell_buffer_config) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ /* only 1 page size for doorbell buffer */
+ ctrlr->shadow_doorbell = spdk_dma_zmalloc(ctrlr->page_size, ctrlr->page_size,
+ &prp1);
+ if (ctrlr->shadow_doorbell == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ ctrlr->eventidx = spdk_dma_zmalloc(ctrlr->page_size, ctrlr->page_size, &prp2);
+ if (ctrlr->eventidx == NULL) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG, NVME_TIMEOUT_INFINITE);
+
+ rc = nvme_ctrlr_cmd_doorbell_buffer_config(ctrlr, prp1, prp2,
+ nvme_ctrlr_set_doorbell_buffer_config_done, ctrlr);
+ if (rc != 0) {
+ goto error;
+ }
+
+ return 0;
+
+error:
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_reset(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ struct spdk_nvme_qpair *qpair;
+ struct nvme_request *req, *tmp;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ if (ctrlr->is_resetting || ctrlr->is_failed) {
+ /*
+ * Controller is already resetting or has failed. Return
+ * immediately since there is no need to kick off another
+ * reset in these cases.
+ */
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return 0;
+ }
+
+ ctrlr->is_resetting = true;
+
+ SPDK_NOTICELOG("resetting controller\n");
+
+ /* Free all of the queued abort requests */
+ STAILQ_FOREACH_SAFE(req, &ctrlr->queued_aborts, stailq, tmp) {
+ STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq);
+ nvme_free_request(req);
+ ctrlr->outstanding_aborts--;
+ }
+
+ /* Disable all queues before disabling the controller hardware. */
+ nvme_qpair_disable(ctrlr->adminq);
+ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
+ nvme_qpair_disable(qpair);
+ }
+
+ /* Doorbell buffer config is invalid during reset */
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+
+ /* Set the state back to INIT to cause a full hardware reset. */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE);
+
+ while (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ if (nvme_ctrlr_process_init(ctrlr) != 0) {
+ SPDK_ERRLOG("%s: controller reinitialization failed\n", __func__);
+ nvme_ctrlr_fail(ctrlr, false);
+ rc = -1;
+ break;
+ }
+ }
+
+ if (!ctrlr->is_failed) {
+ /* Reinitialize qpairs */
+ TAILQ_FOREACH(qpair, &ctrlr->active_io_qpairs, tailq) {
+ if (nvme_transport_ctrlr_reinit_io_qpair(ctrlr, qpair) != 0) {
+ nvme_ctrlr_fail(ctrlr, false);
+ rc = -1;
+ }
+ }
+ }
+
+ ctrlr->is_resetting = false;
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_identify_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("nvme_identify_controller failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /*
+ * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
+ * controller supports.
+ */
+ ctrlr->max_xfer_size = nvme_transport_ctrlr_get_max_xfer_size(ctrlr);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "transport max_xfer_size %u\n", ctrlr->max_xfer_size);
+ if (ctrlr->cdata.mdts > 0) {
+ ctrlr->max_xfer_size = spdk_min(ctrlr->max_xfer_size,
+ ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "MDTS max_xfer_size %u\n", ctrlr->max_xfer_size);
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cdata.cntlid);
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ ctrlr->cntlid = ctrlr->cdata.cntlid;
+ } else {
+ /*
+ * Fabrics controllers should already have CNTLID from the Connect command.
+ *
+ * If CNTLID from Connect doesn't match CNTLID in the Identify Controller data,
+ * trust the one from Connect.
+ */
+ if (ctrlr->cntlid != ctrlr->cdata.cntlid) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME,
+ "Identify CNTLID 0x%04" PRIx16 " != Connect CNTLID 0x%04" PRIx16 "\n",
+ ctrlr->cdata.cntlid, ctrlr->cntlid);
+ }
+ }
+
+ if (ctrlr->cdata.sgls.supported) {
+ ctrlr->flags |= SPDK_NVME_CTRLR_SGL_SUPPORTED;
+ ctrlr->max_sges = nvme_transport_ctrlr_get_max_sges(ctrlr);
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_NUM_QUEUES, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_identify(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY, NVME_TIMEOUT_INFINITE);
+
+ rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0,
+ &ctrlr->cdata, sizeof(ctrlr->cdata),
+ nvme_ctrlr_identify_done, ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+int
+nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_completion_poll_status status;
+ int rc;
+ uint32_t i;
+ uint32_t num_pages;
+ uint32_t next_nsid = 0;
+ uint32_t *new_ns_list = NULL;
+
+
+ /*
+ * The allocated size must be a multiple of sizeof(struct spdk_nvme_ns_list)
+ */
+ num_pages = (ctrlr->num_ns * sizeof(new_ns_list[0]) - 1) / sizeof(struct spdk_nvme_ns_list) + 1;
+ new_ns_list = spdk_dma_zmalloc(num_pages * sizeof(struct spdk_nvme_ns_list), ctrlr->page_size,
+ NULL);
+ if (!new_ns_list) {
+ SPDK_ERRLOG("Failed to allocate active_ns_list!\n");
+ return -ENOMEM;
+ }
+
+ if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 1, 0) && !(ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ /*
+ * Iterate through the pages and fetch each chunk of 1024 namespaces until
+ * there are no more active namespaces
+ */
+ for (i = 0; i < num_pages; i++) {
+ rc = nvme_ctrlr_cmd_identify(ctrlr, SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST, 0, next_nsid,
+ &new_ns_list[1024 * i], sizeof(struct spdk_nvme_ns_list),
+ nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ goto fail;
+ }
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("nvme_ctrlr_cmd_identify_active_ns_list failed!\n");
+ rc = -ENXIO;
+ goto fail;
+ }
+ next_nsid = new_ns_list[1024 * i + 1023];
+ if (next_nsid == 0) {
+ /*
+ * No more active namespaces found, no need to fetch additional chunks
+ */
+ break;
+ }
+ }
+
+ } else {
+ /*
+ * Controller doesn't support active ns list CNS 0x02 so dummy up
+ * an active ns list
+ */
+ for (i = 0; i < ctrlr->num_ns; i++) {
+ new_ns_list[i] = i + 1;
+ }
+ }
+
+ /*
+	 * Now that the list is properly set up, we can swap it into the ctrlr and
+ * free up the previous one.
+ */
+ spdk_dma_free(ctrlr->active_ns_list);
+ ctrlr->active_ns_list = new_ns_list;
+
+ return 0;
+fail:
+ spdk_dma_free(new_ns_list);
+ return rc;
+}
+
+static void
+nvme_ctrlr_identify_ns_async_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg;
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ uint32_t nsid;
+ int rc;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ } else {
+ nvme_ns_set_identify_data(ns);
+ }
+
+ /* move on to the next active NS */
+ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ID_DESCS, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+ ns->ctrlr = ctrlr;
+ ns->id = nsid;
+
+ rc = nvme_ctrlr_identify_ns_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+}
+
+static int
+nvme_ctrlr_identify_ns_async(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ struct spdk_nvme_ns_data *nsdata;
+
+ nsdata = &ctrlr->nsdata[ns->id - 1];
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS, NVME_TIMEOUT_INFINITE);
+ return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id,
+ nsdata, sizeof(*nsdata),
+ nvme_ctrlr_identify_ns_async_done, ns);
+}
+
+static int
+nvme_ctrlr_identify_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+ int rc;
+
+ nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ /* No active NS, move on to the next state */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ ns->ctrlr = ctrlr;
+ ns->id = nsid;
+
+ rc = nvme_ctrlr_identify_ns_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_identify_id_desc_async_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ns *ns = (struct spdk_nvme_ns *)arg;
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ uint32_t nsid;
+ int rc;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /* move on to the next active NS */
+ nsid = spdk_nvme_ctrlr_get_next_active_ns(ctrlr, ns->id);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ rc = nvme_ctrlr_identify_id_desc_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+}
+
+static int
+nvme_ctrlr_identify_id_desc_async(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS, NVME_TIMEOUT_INFINITE);
+ return nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST,
+ 0, ns->id, ns->id_desc_list, sizeof(ns->id_desc_list),
+ nvme_ctrlr_identify_id_desc_async_done, ns);
+}
+
+static int
+nvme_ctrlr_identify_id_desc_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+ int rc;
+
+ if (ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) ||
+ (ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ nsid = spdk_nvme_ctrlr_get_first_active_ns(ctrlr);
+ ns = spdk_nvme_ctrlr_get_ns(ctrlr, nsid);
+ if (ns == NULL) {
+ /* No active NS, move on to the next state */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ rc = nvme_ctrlr_identify_id_desc_async(ns);
+ if (rc) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ }
+
+ return rc;
+}
+
+static void
+nvme_ctrlr_set_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Set Features - Number of Queues failed!\n");
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_GET_NUM_QUEUES, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_set_num_queues(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->opts.num_io_queues > SPDK_NVME_MAX_IO_QUEUES) {
+ SPDK_NOTICELOG("Limiting requested num_io_queues %u to max %d\n",
+ ctrlr->opts.num_io_queues, SPDK_NVME_MAX_IO_QUEUES);
+ ctrlr->opts.num_io_queues = SPDK_NVME_MAX_IO_QUEUES;
+ } else if (ctrlr->opts.num_io_queues < 1) {
+ SPDK_NOTICELOG("Requested num_io_queues 0, increasing to 1\n");
+ ctrlr->opts.num_io_queues = 1;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES, NVME_TIMEOUT_INFINITE);
+
+ rc = nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->opts.num_io_queues,
+ nvme_ctrlr_set_num_queues_done, ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_get_num_queues_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ uint32_t cq_allocated, sq_allocated, min_allocated, i;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Get Features - Number of Queues failed!\n");
+ ctrlr->opts.num_io_queues = 0;
+ } else {
+ /*
+ * Data in cdw0 is 0-based.
+		 * The lower 16 bits indicate the number of submission queues allocated.
+		 * The upper 16 bits indicate the number of completion queues allocated.
+ */
+ sq_allocated = (cpl->cdw0 & 0xFFFF) + 1;
+ cq_allocated = (cpl->cdw0 >> 16) + 1;
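+		/*
+		 * For instance, a (hypothetical) cdw0 of 0x001f003f would mean
+		 * 0x3f + 1 = 64 submission queues and 0x1f + 1 = 32 completion
+		 * queues were allocated, so min_allocated below becomes 32.
+		 */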
+
+ /*
+ * For 1:1 queue mapping, set number of allocated queues to be minimum of
+ * submission and completion queues.
+ */
+ min_allocated = spdk_min(sq_allocated, cq_allocated);
+
+ /* Set number of queues to be minimum of requested and actually allocated. */
+ ctrlr->opts.num_io_queues = spdk_min(min_allocated, ctrlr->opts.num_io_queues);
+ }
+
+ ctrlr->free_io_qids = spdk_bit_array_create(ctrlr->opts.num_io_queues + 1);
+ if (ctrlr->free_io_qids == NULL) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /* Initialize list of free I/O queue IDs. QID 0 is the admin queue. */
+ spdk_bit_array_clear(ctrlr->free_io_qids, 0);
+ for (i = 1; i <= ctrlr->opts.num_io_queues; i++) {
+ spdk_bit_array_set(ctrlr->free_io_qids, i);
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONSTRUCT_NS, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_get_num_queues(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES, NVME_TIMEOUT_INFINITE);
+
+ /* Obtain the number of queues allocated using Get Features. */
+ rc = nvme_ctrlr_cmd_get_num_queues(ctrlr, nvme_ctrlr_get_num_queues_done, ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_set_keep_alive_timeout_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ uint32_t keep_alive_interval_ms;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_ERRLOG("Keep alive timeout Get Feature failed: SC %x SCT %x\n",
+ cpl->status.sc, cpl->status.sct);
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ if (ctrlr->opts.keep_alive_timeout_ms != cpl->cdw0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller adjusted keep alive timeout to %u ms\n",
+ cpl->cdw0);
+ }
+
+ ctrlr->opts.keep_alive_timeout_ms = cpl->cdw0;
+
+ keep_alive_interval_ms = ctrlr->opts.keep_alive_timeout_ms / 2;
+ if (keep_alive_interval_ms == 0) {
+ keep_alive_interval_ms = 1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Sending keep alive every %u ms\n", keep_alive_interval_ms);
+
+ ctrlr->keep_alive_interval_ticks = (keep_alive_interval_ms * spdk_get_ticks_hz()) / UINT64_C(1000);
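+	/*
+	 * For example, a 10000 ms keep alive timeout yields a 5000 ms send
+	 * interval; with a (hypothetical) 1 GHz tick rate that is 5 * 10^9 ticks.
+	 */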
+
+ /* Schedule the first Keep Alive to be sent as soon as possible. */
+ ctrlr->next_keep_alive_tick = spdk_get_ticks();
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_set_keep_alive_timeout(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->opts.keep_alive_timeout_ms == 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ if (ctrlr->cdata.kas == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Controller KAS is 0 - not enabling Keep Alive\n");
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_HOST_ID, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT, NVME_TIMEOUT_INFINITE);
+
+ /* Retrieve actual keep alive timeout, since the controller may have adjusted it. */
+ rc = spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_KEEP_ALIVE_TIMER, 0, NULL, 0,
+ nvme_ctrlr_set_keep_alive_timeout_done, ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Keep alive timeout Get Feature failed: %d\n", rc);
+ ctrlr->opts.keep_alive_timeout_ms = 0;
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_set_host_id_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ /*
+ * Treat Set Features - Host ID failure as non-fatal, since the Host ID feature
+ * is optional.
+ */
+ SPDK_WARNLOG("Set Features - Host ID failed: SC 0x%x SCT 0x%x\n",
+ cpl->status.sc, cpl->status.sct);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Set Features - Host ID was successful\n");
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_set_host_id(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint8_t *host_id;
+ uint32_t host_id_size;
+ int rc;
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ /*
+ * NVMe-oF sends the host ID during Connect and doesn't allow
+ * Set Features - Host Identifier after Connect, so we don't need to do anything here.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "NVMe-oF transport - not sending Set Features - Host ID\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ if (ctrlr->cdata.ctratt.host_id_exhid_supported) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 128-bit extended host identifier\n");
+ host_id = ctrlr->opts.extended_host_id;
+ host_id_size = sizeof(ctrlr->opts.extended_host_id);
+ } else {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Using 64-bit host identifier\n");
+ host_id = ctrlr->opts.host_id;
+ host_id_size = sizeof(ctrlr->opts.host_id);
+ }
+
+ /* If the user specified an all-zeroes host identifier, don't send the command. */
+ if (spdk_mem_all_zero(host_id, host_id_size)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME,
+ "User did not specify host ID - not sending Set Features - Host ID\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_READY, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+
+ SPDK_TRACEDUMP(SPDK_LOG_NVME, "host_id", host_id, host_id_size);
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_HOST_ID, NVME_TIMEOUT_INFINITE);
+
+ rc = nvme_ctrlr_cmd_set_host_id(ctrlr, host_id, host_id_size, nvme_ctrlr_set_host_id_done, ctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("Set Features - Host ID failed: %d\n", rc);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void
+nvme_ctrlr_destruct_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->ns) {
+ uint32_t i, num_ns = ctrlr->num_ns;
+
+ for (i = 0; i < num_ns; i++) {
+ nvme_ns_destruct(&ctrlr->ns[i]);
+ }
+
+ spdk_free(ctrlr->ns);
+ ctrlr->ns = NULL;
+ ctrlr->num_ns = 0;
+ }
+
+ if (ctrlr->nsdata) {
+ spdk_free(ctrlr->nsdata);
+ ctrlr->nsdata = NULL;
+ }
+
+ spdk_dma_free(ctrlr->active_ns_list);
+ ctrlr->active_ns_list = NULL;
+}
+
+static void
+nvme_ctrlr_update_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint32_t i, nn = ctrlr->cdata.nn;
+ struct spdk_nvme_ns_data *nsdata;
+
+ for (i = 0; i < nn; i++) {
+ struct spdk_nvme_ns *ns = &ctrlr->ns[i];
+ uint32_t nsid = i + 1;
+ nsdata = &ctrlr->nsdata[nsid - 1];
+
+ if ((nsdata->ncap == 0) && spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) {
+ if (nvme_ns_construct(ns, nsid, ctrlr) != 0) {
+ continue;
+ }
+ }
+
+ if (nsdata->ncap && !spdk_nvme_ctrlr_is_active_ns(ctrlr, nsid)) {
+ nvme_ns_destruct(ns);
+ }
+ }
+}
+
+static int
+nvme_ctrlr_construct_namespaces(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc = 0;
+ uint32_t nn = ctrlr->cdata.nn;
+ uint64_t phys_addr = 0;
+
+ /* ctrlr->num_ns may be 0 (startup) or a different number of namespaces (reset),
+ * so check if we need to reallocate.
+ */
+ if (nn != ctrlr->num_ns) {
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+
+ if (nn == 0) {
+ SPDK_WARNLOG("controller has 0 namespaces\n");
+ return 0;
+ }
+
+ ctrlr->ns = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns), 64,
+ &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (ctrlr->ns == NULL) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ ctrlr->nsdata = spdk_zmalloc(nn * sizeof(struct spdk_nvme_ns_data), 64,
+ &phys_addr, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE | SPDK_MALLOC_DMA);
+ if (ctrlr->nsdata == NULL) {
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ ctrlr->num_ns = nn;
+ }
+
+ return 0;
+
+fail:
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+ return rc;
+}
+
+static void
+nvme_ctrlr_async_event_cb(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_async_event_request *aer = arg;
+ struct spdk_nvme_ctrlr *ctrlr = aer->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+ union spdk_nvme_async_event_completion event;
+ int rc;
+
+ if (cpl->status.sct == SPDK_NVME_SCT_GENERIC &&
+ cpl->status.sc == SPDK_NVME_SC_ABORTED_SQ_DELETION) {
+ /*
+		 * This is simulated when the controller is being shut down, to
+ * effectively abort outstanding asynchronous event requests
+ * and make sure all memory is freed. Do not repost the
+ * request in this case.
+ */
+ return;
+ }
+
+ if (cpl->status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC &&
+ cpl->status.sc == SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED) {
+ /*
+ * SPDK will only send as many AERs as the device says it supports,
+ * so this status code indicates an out-of-spec device. Do not repost
+ * the request in this case.
+ */
+		SPDK_ERRLOG("Controller appears out-of-spec for asynchronous event request handling.\n"
+			    "Do not repost this AER.\n");
+ return;
+ }
+
+ event.raw = cpl->cdw0;
+ if ((event.bits.async_event_type == SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) &&
+ (event.bits.async_event_info == SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED)) {
+ rc = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (rc) {
+ return;
+ }
+ nvme_ctrlr_update_namespaces(ctrlr);
+ }
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc && active_proc->aer_cb_fn) {
+ active_proc->aer_cb_fn(active_proc->aer_cb_arg, cpl);
+ }
+
+ /*
+ * Repost another asynchronous event request to replace the one
+ * that just completed.
+ */
+ if (nvme_ctrlr_construct_and_submit_aer(ctrlr, aer)) {
+ /*
+ * We can't do anything to recover from a failure here,
+		 * so just print an error message and leave the AER unsubmitted.
+ */
+ SPDK_ERRLOG("resubmitting AER failed!\n");
+ }
+}
+
+static int
+nvme_ctrlr_construct_and_submit_aer(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_async_event_request *aer)
+{
+ struct nvme_request *req;
+
+ aer->ctrlr = ctrlr;
+ req = nvme_allocate_request_null(ctrlr->adminq, nvme_ctrlr_async_event_cb, aer);
+ aer->req = req;
+ if (req == NULL) {
+ return -1;
+ }
+
+ req->cmd.opc = SPDK_NVME_OPC_ASYNC_EVENT_REQUEST;
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static void
+nvme_ctrlr_configure_aer_done(void *arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_async_event_request *aer;
+ int rc;
+ uint32_t i;
+ struct spdk_nvme_ctrlr *ctrlr = (struct spdk_nvme_ctrlr *)arg;
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ SPDK_NOTICELOG("nvme_ctrlr_configure_aer failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+
+ /* aerl is a zero-based value, so we need to add 1 here. */
+ ctrlr->num_aers = spdk_min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl + 1));
+
+ for (i = 0; i < ctrlr->num_aers; i++) {
+ aer = &ctrlr->aer[i];
+ rc = nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+ if (rc) {
+ SPDK_ERRLOG("nvme_ctrlr_construct_and_submit_aer failed!\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return;
+ }
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES, NVME_TIMEOUT_INFINITE);
+}
+
+static int
+nvme_ctrlr_configure_aer(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_feat_async_event_configuration config;
+ int rc;
+
+ config.raw = 0;
+ config.bits.crit_warn.bits.available_spare = 1;
+ config.bits.crit_warn.bits.temperature = 1;
+ config.bits.crit_warn.bits.device_reliability = 1;
+ config.bits.crit_warn.bits.read_only = 1;
+ config.bits.crit_warn.bits.volatile_memory_backup = 1;
+
+ if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 2, 0)) {
+ if (ctrlr->cdata.oaes.ns_attribute_notices) {
+ config.bits.ns_attr_notice = 1;
+ }
+ if (ctrlr->cdata.oaes.fw_activation_notices) {
+ config.bits.fw_activation_notice = 1;
+ }
+ }
+ if (ctrlr->vs.raw >= SPDK_NVME_VERSION(1, 3, 0) && ctrlr->cdata.lpa.telemetry) {
+ config.bits.telemetry_log_notice = 1;
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+
+ rc = nvme_ctrlr_cmd_set_async_event_config(ctrlr, config,
+ nvme_ctrlr_configure_aer_done,
+ ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ERROR, NVME_TIMEOUT_INFINITE);
+ return rc;
+ }
+
+ return 0;
+}
+
+struct spdk_nvme_ctrlr_process *
+spdk_nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr, pid_t pid)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
+ if (active_proc->pid == pid) {
+ return active_proc;
+ }
+ }
+
+ return NULL;
+}
+
+struct spdk_nvme_ctrlr_process *
+spdk_nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return spdk_nvme_ctrlr_get_process(ctrlr, getpid());
+}
+
+/**
+ * This function will be called when a process starts using the controller.
+ * 1. For the primary process, it is called when constructing the controller.
+ * 2. For a secondary process, it is called when probing the controller.
+ * Note: it checks whether the process has already been added and, if so, does nothing.
+ */
+int
+nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle)
+{
+ struct spdk_nvme_ctrlr_process *ctrlr_proc;
+ pid_t pid = getpid();
+
+ /* Check whether the process is already added or not */
+ if (spdk_nvme_ctrlr_get_process(ctrlr, pid)) {
+ return 0;
+ }
+
+ /* Initialize the per process properties for this ctrlr */
+ ctrlr_proc = spdk_zmalloc(sizeof(struct spdk_nvme_ctrlr_process),
+ 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (ctrlr_proc == NULL) {
+ SPDK_ERRLOG("failed to allocate memory to track the process props\n");
+
+ return -1;
+ }
+
+ ctrlr_proc->is_primary = spdk_process_is_primary();
+ ctrlr_proc->pid = pid;
+ STAILQ_INIT(&ctrlr_proc->active_reqs);
+ ctrlr_proc->devhandle = devhandle;
+ ctrlr_proc->ref = 0;
+ TAILQ_INIT(&ctrlr_proc->allocated_io_qpairs);
+
+ TAILQ_INSERT_TAIL(&ctrlr->active_procs, ctrlr_proc, tailq);
+
+ return 0;
+}
+
+/**
+ * This function will be called when the process detaches the controller.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_remove_process(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_ctrlr_process *proc)
+{
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+
+ assert(STAILQ_EMPTY(&proc->active_reqs));
+
+ TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ TAILQ_REMOVE(&ctrlr->active_procs, proc, tailq);
+
+ spdk_dma_free(proc);
+}
+
+/**
+ * This function will be called when a process has exited unexpectedly,
+ * in order to free any incomplete NVMe requests, allocated I/O qpairs,
+ * and allocated memory.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_ctrlr_cleanup_process(struct spdk_nvme_ctrlr_process *proc)
+{
+ struct nvme_request *req, *tmp_req;
+ struct spdk_nvme_qpair *qpair, *tmp_qpair;
+
+ STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
+ STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
+
+ assert(req->pid == proc->pid);
+
+ nvme_free_request(req);
+ }
+
+ TAILQ_FOREACH_SAFE(qpair, &proc->allocated_io_qpairs, per_process_tailq, tmp_qpair) {
+ TAILQ_REMOVE(&proc->allocated_io_qpairs, qpair, per_process_tailq);
+
+ /*
+ * The process may have been killed while some qpairs were in their
+ * completion context. Clear that flag here to allow these IO
+ * qpairs to be deleted.
+ */
+ qpair->in_completion_context = 0;
+
+ qpair->no_deletion_notification_needed = 1;
+
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ spdk_dma_free(proc);
+}
+
+/**
+ * This function will be called when destructing the controller.
+ * 1. There are no more outstanding admin requests on this controller.
+ * 2. Clean up any leftover resource allocations whose associated process is gone.
+ */
+void
+nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc, *tmp;
+
+ /* Free all the processes' properties and make sure no pending admin IOs */
+ TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) {
+ TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq);
+
+ assert(STAILQ_EMPTY(&active_proc->active_reqs));
+
+ spdk_free(active_proc);
+ }
+}
+
+/**
+ * This function will be called when any other process attaches or
+ * detaches the controller in order to clean up any unexpectedly
+ * terminated processes.
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static int
+nvme_ctrlr_remove_inactive_proc(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc, *tmp;
+ int active_proc_count = 0;
+
+ TAILQ_FOREACH_SAFE(active_proc, &ctrlr->active_procs, tailq, tmp) {
+ if ((kill(active_proc->pid, 0) == -1) && (errno == ESRCH)) {
+			SPDK_ERRLOG("process %d terminated unexpectedly\n", active_proc->pid);
+
+ TAILQ_REMOVE(&ctrlr->active_procs, active_proc, tailq);
+
+ nvme_ctrlr_cleanup_process(active_proc);
+ } else {
+ active_proc_count++;
+ }
+ }
+
+ return active_proc_count;
+}
+
+void
+nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->ref++;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+void
+nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ int proc_count;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ proc_count = nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->ref--;
+ assert(active_proc->ref >= 0);
+
+ /*
+ * The last active process will be removed at the end of
+ * the destruction of the controller.
+ */
+ if (active_proc->ref == 0 && proc_count != 1) {
+ nvme_ctrlr_remove_process(ctrlr, active_proc);
+ }
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+int
+nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ int ref = 0;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ nvme_ctrlr_remove_inactive_proc(ctrlr);
+
+ TAILQ_FOREACH(active_proc, &ctrlr->active_procs, tailq) {
+ ref += active_proc->ref;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return ref;
+}
+
+/**
+ * Get the PCI device handle which is only visible to its associated process.
+ */
+struct spdk_pci_device *
+nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+ struct spdk_pci_device *devhandle = NULL;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ devhandle = active_proc->devhandle;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return devhandle;
+}
+
+static void
+nvme_ctrlr_enable_admin_queue(struct spdk_nvme_ctrlr *ctrlr)
+{
+ nvme_transport_qpair_reset(ctrlr->adminq);
+ nvme_qpair_enable(ctrlr->adminq);
+}
+
+/**
+ * This function will be called repeatedly during initialization until the controller is ready.
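+ *
+ * A minimal sketch of how a caller might poll this state machine until the
+ * controller is ready (hypothetical loop, not the actual SPDK attach code):
+ *
+ *   do {
+ *       rc = nvme_ctrlr_process_init(ctrlr);
+ *   } while (rc == 0 && ctrlr->state != NVME_CTRLR_STATE_READY);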
+ */
+int
+nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_cc_register cc;
+ union spdk_nvme_csts_register csts;
+ uint32_t ready_timeout_in_ms;
+ int rc = 0;
+
+ /*
+ * May need to avoid accessing any register on the target controller
+ * for a while. Return early without touching the FSM.
+	 * (The sleep_timeout_tsc > 0 check exists to keep the unit tests working.)
+ */
+ if ((ctrlr->sleep_timeout_tsc > 0) &&
+ (spdk_get_ticks() <= ctrlr->sleep_timeout_tsc)) {
+ return 0;
+ }
+ ctrlr->sleep_timeout_tsc = 0;
+
+ if (nvme_ctrlr_get_cc(ctrlr, &cc) ||
+ nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE) {
+ /* While a device is resetting, it may be unable to service MMIO reads
+ * temporarily. Allow for this case.
+ */
+ SPDK_ERRLOG("Get registers failed while waiting for CSTS.RDY == 0\n");
+ goto init_timeout;
+ }
+ SPDK_ERRLOG("Failed to read CC and CSTS in state %d\n", ctrlr->state);
+ nvme_ctrlr_fail(ctrlr, false);
+ return -EIO;
+ }
+
+ ready_timeout_in_ms = 500 * ctrlr->cap.bits.to;
+
+ /*
+ * Check if the current initialization step is done or has timed out.
+ */
+ switch (ctrlr->state) {
+ case NVME_CTRLR_STATE_INIT_DELAY:
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, ready_timeout_in_ms);
+ /*
+ * Controller may need some delay before it's enabled.
+ *
+ * This is a workaround for an issue where the PCIe-attached NVMe controller
+ * is not ready after VFIO reset. We delay the initialization rather than the
+ * enabling itself, because this is required only for the very first enabling
+ * - directly after a VFIO reset.
+ *
+ * TODO: Figure out what is actually going wrong.
+ */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Adding 2 second delay before initializing the controller\n");
+ ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2000 * spdk_get_ticks_hz() / 1000);
+ break;
+
+ case NVME_CTRLR_STATE_INIT:
+ /* Begin the hardware initialization by making sure the controller is disabled. */
+ if (cc.bits.en) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1\n");
+ /*
+ * Controller is currently enabled. We need to disable it to cause a reset.
+ *
+ * If CC.EN = 1 && CSTS.RDY = 0, the controller is in the process of becoming ready.
+ * Wait for the ready bit to be 1 before disabling the controller.
+ */
+ if (csts.bits.rdy == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 0 - waiting for reset to complete\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1, ready_timeout_in_ms);
+ return 0;
+ }
+
+ /* CC.EN = 1 && CSTS.RDY == 1, so we can immediately disable the controller. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n");
+ cc.bits.en = 0;
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ nvme_ctrlr_fail(ctrlr, false);
+ return -EIO;
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+
+ /*
+ * Wait 2.5 seconds before accessing PCI registers.
+			 * Not using sleep() to avoid blocking other controllers' initialization.
+ */
+ if (ctrlr->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Applying quirk: delay 2.5 seconds before reading registers\n");
+ ctrlr->sleep_timeout_tsc = spdk_get_ticks() + (2500 * spdk_get_ticks_hz() / 1000);
+ }
+ return 0;
+ } else {
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 1 - waiting for shutdown to complete\n");
+ }
+
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1:
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - disabling controller\n");
+ /* CC.EN = 1 && CSTS.RDY = 1, so we can set CC.EN = 0 now. */
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 0\n");
+ cc.bits.en = 0;
+ if (nvme_ctrlr_set_cc(ctrlr, &cc)) {
+ SPDK_ERRLOG("set_cc() failed\n");
+ nvme_ctrlr_fail(ctrlr, false);
+ return -EIO;
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0, ready_timeout_in_ms);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0:
+ if (csts.bits.rdy == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 0 && CSTS.RDY = 0\n");
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE, ready_timeout_in_ms);
+ /*
+ * Delay 100us before setting CC.EN = 1. Some NVMe SSDs miss CC.EN getting
+ * set to 1 if it is too soon after CSTS.RDY is reported as 0.
+ */
+ spdk_delay_us(100);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_ENABLE:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Setting CC.EN = 1\n");
+ rc = nvme_ctrlr_enable(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1, ready_timeout_in_ms);
+ return rc;
+
+ case NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1:
+ if (csts.bits.rdy == 1) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CC.EN = 1 && CSTS.RDY = 1 - controller is ready\n");
+ /*
+ * The controller has been enabled.
+ * Perform the rest of initialization serially.
+ */
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE, NVME_TIMEOUT_INFINITE);
+ return 0;
+ }
+ break;
+
+ case NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE:
+ nvme_ctrlr_enable_admin_queue(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY:
+ rc = nvme_ctrlr_identify(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_NUM_QUEUES:
+ rc = nvme_ctrlr_set_num_queues(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_GET_NUM_QUEUES:
+ rc = nvme_ctrlr_get_num_queues(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_CONSTRUCT_NS:
+ rc = nvme_ctrlr_construct_namespaces(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS:
+ rc = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (rc < 0) {
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+ }
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_IDENTIFY_NS, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_NS:
+ rc = nvme_ctrlr_identify_namespaces(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_IDENTIFY_ID_DESCS:
+ rc = nvme_ctrlr_identify_id_desc_namespaces(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_CONFIGURE_AER, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_CONFIGURE_AER:
+ rc = nvme_ctrlr_configure_aer(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES:
+ rc = nvme_ctrlr_set_supported_log_pages(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES:
+ nvme_ctrlr_set_supported_features(ctrlr);
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_SET_DB_BUF_CFG, NVME_TIMEOUT_INFINITE);
+ break;
+
+ case NVME_CTRLR_STATE_SET_DB_BUF_CFG:
+ rc = nvme_ctrlr_set_doorbell_buffer_config(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT:
+ rc = nvme_ctrlr_set_keep_alive_timeout(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_SET_HOST_ID:
+ rc = nvme_ctrlr_set_host_id(ctrlr);
+ break;
+
+ case NVME_CTRLR_STATE_WAIT_FOR_HOST_ID:
+ spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ break;
+
+ case NVME_CTRLR_STATE_READY:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Ctrlr already in ready state\n");
+ return 0;
+
+ case NVME_CTRLR_STATE_ERROR:
+ SPDK_ERRLOG("Ctrlr %s is in error state\n", ctrlr->trid.traddr);
+ return -1;
+
+ default:
+ assert(0);
+ nvme_ctrlr_fail(ctrlr, false);
+ return -1;
+ }
+
+init_timeout:
+ if (ctrlr->state_timeout_tsc != NVME_TIMEOUT_INFINITE &&
+ spdk_get_ticks() > ctrlr->state_timeout_tsc) {
+ SPDK_ERRLOG("Initialization timed out in state %d\n", ctrlr->state);
+ nvme_ctrlr_fail(ctrlr, false);
+ return -1;
+ }
+
+ return rc;
+}
+
+int
+nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx)
+{
+ pthread_mutexattr_t attr;
+ int rc = 0;
+
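+	/*
+	 * The mutex is made recursive and (where supported) robust and
+	 * process-shared, so that multiple processes attached to the same
+	 * controller can share it and a crashed holder does not leave it
+	 * permanently locked.
+	 */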
+ if (pthread_mutexattr_init(&attr)) {
+ return -1;
+ }
+ if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE) ||
+#ifndef __FreeBSD__
+ pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST) ||
+ pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED) ||
+#endif
+ pthread_mutex_init(mtx, &attr)) {
+ rc = -1;
+ }
+ pthread_mutexattr_destroy(&attr);
+ return rc;
+}
+
+int
+nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ if (ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE) {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT_DELAY, NVME_TIMEOUT_INFINITE);
+ } else {
+ nvme_ctrlr_set_state(ctrlr, NVME_CTRLR_STATE_INIT, NVME_TIMEOUT_INFINITE);
+ }
+
+ ctrlr->flags = 0;
+ ctrlr->free_io_qids = NULL;
+ ctrlr->is_resetting = false;
+ ctrlr->is_failed = false;
+
+ TAILQ_INIT(&ctrlr->active_io_qpairs);
+ STAILQ_INIT(&ctrlr->queued_aborts);
+ ctrlr->outstanding_aborts = 0;
+
+ rc = nvme_robust_mutex_init_recursive_shared(&ctrlr->ctrlr_lock);
+ if (rc != 0) {
+ return rc;
+ }
+
+ TAILQ_INIT(&ctrlr->active_procs);
+
+ return rc;
+}
+
+/* This function should be called once at ctrlr initialization to set up constant properties. */
+void
+nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap,
+ const union spdk_nvme_vs_register *vs)
+{
+ ctrlr->cap = *cap;
+ ctrlr->vs = *vs;
+
+ ctrlr->min_page_size = 1u << (12 + ctrlr->cap.bits.mpsmin);
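+	/* For example, CAP.MPSMIN == 0 corresponds to a 4 KiB (1 << 12) minimum page size. */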
+
+ /* For now, always select page_size == min_page_size. */
+ ctrlr->page_size = ctrlr->min_page_size;
+
+ ctrlr->opts.io_queue_size = spdk_max(ctrlr->opts.io_queue_size, SPDK_NVME_IO_QUEUE_MIN_ENTRIES);
+ ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, MAX_IO_QUEUE_ENTRIES);
+ ctrlr->opts.io_queue_size = spdk_min(ctrlr->opts.io_queue_size, ctrlr->cap.bits.mqes + 1u);
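+	/* E.g. a controller reporting CAP.MQES == 255 caps io_queue_size at 256 entries. */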
+
+ ctrlr->opts.io_queue_requests = spdk_max(ctrlr->opts.io_queue_requests, ctrlr->opts.io_queue_size);
+}
+
+void
+nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr)
+{
+ pthread_mutex_destroy(&ctrlr->ctrlr_lock);
+}
+
+void
+nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct spdk_nvme_qpair *qpair, *tmp;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Prepare to destruct SSD: %s\n", ctrlr->trid.traddr);
+ TAILQ_FOREACH_SAFE(qpair, &ctrlr->active_io_qpairs, tailq, tmp) {
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+
+ nvme_ctrlr_free_doorbell_buffer(ctrlr);
+
+ nvme_ctrlr_shutdown(ctrlr);
+
+ nvme_ctrlr_destruct_namespaces(ctrlr);
+
+ spdk_bit_array_free(&ctrlr->free_io_qids);
+
+ nvme_transport_ctrlr_destruct(ctrlr);
+}
+
+int
+nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_request *req)
+{
+ return nvme_qpair_submit_request(ctrlr->adminq, req);
+}
+
+static void
+nvme_keep_alive_completion(void *cb_ctx, const struct spdk_nvme_cpl *cpl)
+{
+ /* Do nothing */
+}
+
+/*
+ * Check if we need to send a Keep Alive command.
+ * Caller must hold ctrlr->ctrlr_lock.
+ */
+static void
+nvme_ctrlr_keep_alive(struct spdk_nvme_ctrlr *ctrlr)
+{
+ uint64_t now;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ now = spdk_get_ticks();
+ if (now < ctrlr->next_keep_alive_tick) {
+ return;
+ }
+
+ req = nvme_allocate_request_null(ctrlr->adminq, nvme_keep_alive_completion, NULL);
+ if (req == NULL) {
+ return;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_KEEP_ALIVE;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ if (rc != 0) {
+ SPDK_ERRLOG("Submitting Keep Alive failed\n");
+ }
+
+ ctrlr->next_keep_alive_tick = now + ctrlr->keep_alive_interval_ticks;
+}
+
+int32_t
+spdk_nvme_ctrlr_process_admin_completions(struct spdk_nvme_ctrlr *ctrlr)
+{
+ int32_t num_completions;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ if (ctrlr->keep_alive_interval_ticks) {
+ nvme_ctrlr_keep_alive(ctrlr);
+ }
+ num_completions = spdk_nvme_qpair_process_completions(ctrlr->adminq, 0);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return num_completions;
+}
+
+const struct spdk_nvme_ctrlr_data *
+spdk_nvme_ctrlr_get_data(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return &ctrlr->cdata;
+}
+
+union spdk_nvme_csts_register spdk_nvme_ctrlr_get_regs_csts(struct spdk_nvme_ctrlr *ctrlr)
+{
+ union spdk_nvme_csts_register csts;
+
+ if (nvme_ctrlr_get_csts(ctrlr, &csts)) {
+ csts.raw = 0xFFFFFFFFu;
+ }
+ return csts;
+}
+
+union spdk_nvme_cap_register spdk_nvme_ctrlr_get_regs_cap(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->cap;
+}
+
+union spdk_nvme_vs_register spdk_nvme_ctrlr_get_regs_vs(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->vs;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_num_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->num_ns;
+}
+
+static int32_t
+spdk_nvme_ctrlr_active_ns_idx(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ int32_t result = -1;
+
+ if (ctrlr->active_ns_list == NULL || nsid == 0 || nsid > ctrlr->num_ns) {
+ return result;
+ }
+
+ int32_t lower = 0;
+ int32_t upper = ctrlr->num_ns - 1;
+ int32_t mid;
+
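+	/*
+	 * active_ns_list is sorted in ascending order and zero-padded at the
+	 * tail, so a zero entry is treated as "greater than nsid" and pushes
+	 * the search into the lower half. For example, with a list of
+	 * {1, 2, 5, 7, 0, 0, 0, 0} and nsid == 5, the search returns index 2.
+	 */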
+ while (lower <= upper) {
+ mid = lower + (upper - lower) / 2;
+ if (ctrlr->active_ns_list[mid] == nsid) {
+ result = mid;
+ break;
+ } else {
+ if (ctrlr->active_ns_list[mid] != 0 && ctrlr->active_ns_list[mid] < nsid) {
+ lower = mid + 1;
+ } else {
+ upper = mid - 1;
+ }
+
+ }
+ }
+
+ return result;
+}
+
+bool
+spdk_nvme_ctrlr_is_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ return spdk_nvme_ctrlr_active_ns_idx(ctrlr, nsid) != -1;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_first_active_ns(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->active_ns_list ? ctrlr->active_ns_list[0] : 0;
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_next_active_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t prev_nsid)
+{
+ int32_t nsid_idx = spdk_nvme_ctrlr_active_ns_idx(ctrlr, prev_nsid);
+ if (ctrlr->active_ns_list && nsid_idx >= 0 && (uint32_t)nsid_idx < ctrlr->num_ns - 1) {
+ return ctrlr->active_ns_list[nsid_idx + 1];
+ }
+ return 0;
+}
+
+struct spdk_nvme_ns *
+spdk_nvme_ctrlr_get_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ if (nsid < 1 || nsid > ctrlr->num_ns) {
+ return NULL;
+ }
+
+ return &ctrlr->ns[nsid - 1];
+}
+
+struct spdk_pci_device *
+spdk_nvme_ctrlr_get_pci_device(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr == NULL) {
+ return NULL;
+ }
+
+ if (ctrlr->trid.trtype != SPDK_NVME_TRANSPORT_PCIE) {
+ return NULL;
+ }
+
+ return nvme_ctrlr_proc_get_devhandle(ctrlr);
+}
+
+uint32_t
+spdk_nvme_ctrlr_get_max_xfer_size(const struct spdk_nvme_ctrlr *ctrlr)
+{
+ return ctrlr->max_xfer_size;
+}
+
+void
+spdk_nvme_ctrlr_register_aer_callback(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_aer_cb aer_cb_fn,
+ void *aer_cb_arg)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->aer_cb_fn = aer_cb_fn;
+ active_proc->aer_cb_arg = aer_cb_arg;
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+void
+spdk_nvme_ctrlr_register_timeout_callback(struct spdk_nvme_ctrlr *ctrlr,
+ uint64_t timeout_us, spdk_nvme_timeout_cb cb_fn, void *cb_arg)
+{
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (active_proc) {
+ active_proc->timeout_ticks = timeout_us * spdk_get_ticks_hz() / 1000000ULL;
+ active_proc->timeout_cb_fn = cb_fn;
+ active_proc->timeout_cb_arg = cb_arg;
+ }
+
+ ctrlr->timeout_enabled = true;
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+}
+
+bool
+spdk_nvme_ctrlr_is_log_page_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page)
+{
+ /* No bounds check necessary, since log_page is uint8_t and log_page_supported has 256 entries */
+ SPDK_STATIC_ASSERT(sizeof(ctrlr->log_page_supported) == 256, "log_page_supported size mismatch");
+ return ctrlr->log_page_supported[log_page];
+}
+
+bool
+spdk_nvme_ctrlr_is_feature_supported(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature_code)
+{
+ /* No bounds check necessary, since feature_code is uint8_t and feature_supported has 256 entries */
+ SPDK_STATIC_ASSERT(sizeof(ctrlr->feature_supported) == 256, "feature_supported size mismatch");
+ return ctrlr->feature_supported[feature_code];
+}
+
+int
+spdk_nvme_ctrlr_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload)
+{
+ struct nvme_completion_poll_status status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ res = nvme_ctrlr_cmd_attach_ns(ctrlr, nsid, payload,
+ nvme_completion_poll_cb, &status);
+ if (res) {
+ return res;
+ }
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_attach_ns failed!\n");
+ return -ENXIO;
+ }
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ return nvme_ns_construct(ns, nsid, ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload)
+{
+ struct nvme_completion_poll_status status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ res = nvme_ctrlr_cmd_detach_ns(ctrlr, nsid, payload,
+ nvme_completion_poll_cb, &status);
+ if (res) {
+ return res;
+ }
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_detach_ns failed!\n");
+ return -ENXIO;
+ }
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ /* Inactive NS */
+ nvme_ns_destruct(ns);
+
+ return 0;
+}
+
+uint32_t
+spdk_nvme_ctrlr_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload)
+{
+ struct nvme_completion_poll_status status;
+ int res;
+ uint32_t nsid;
+ struct spdk_nvme_ns *ns;
+
+ res = nvme_ctrlr_cmd_create_ns(ctrlr, payload, nvme_completion_poll_cb, &status);
+ if (res) {
+ return 0;
+ }
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_create_ns failed!\n");
+ return 0;
+ }
+
+ nsid = status.cpl.cdw0;
+ ns = &ctrlr->ns[nsid - 1];
+ /* Inactive NS */
+ res = nvme_ns_construct(ns, nsid, ctrlr);
+ if (res) {
+ return 0;
+ }
+
+ /* Return the namespace ID that was created */
+ return nsid;
+}
+
+int
+spdk_nvme_ctrlr_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid)
+{
+ struct nvme_completion_poll_status status;
+ int res;
+ struct spdk_nvme_ns *ns;
+
+ res = nvme_ctrlr_cmd_delete_ns(ctrlr, nsid, nvme_completion_poll_cb, &status);
+ if (res) {
+ return res;
+ }
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_delete_ns failed!\n");
+ return -ENXIO;
+ }
+
+ res = nvme_ctrlr_identify_active_ns(ctrlr);
+ if (res) {
+ return res;
+ }
+
+ ns = &ctrlr->ns[nsid - 1];
+ nvme_ns_destruct(ns);
+
+ return 0;
+}
+
+int
+spdk_nvme_ctrlr_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_format *format)
+{
+ struct nvme_completion_poll_status status;
+ int res;
+
+ res = nvme_ctrlr_cmd_format(ctrlr, nsid, format, nvme_completion_poll_cb,
+ &status);
+ if (res) {
+ return res;
+ }
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_format failed!\n");
+ return -ENXIO;
+ }
+
+ return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+int
+spdk_nvme_ctrlr_update_firmware(struct spdk_nvme_ctrlr *ctrlr, void *payload, uint32_t size,
+ int slot, enum spdk_nvme_fw_commit_action commit_action, struct spdk_nvme_status *completion_status)
+{
+ struct spdk_nvme_fw_commit fw_commit;
+ struct nvme_completion_poll_status status;
+ int res;
+ unsigned int size_remaining;
+ unsigned int offset;
+ unsigned int transfer;
+ void *p;
+
+ if (!completion_status) {
+ return -EINVAL;
+ }
+ memset(completion_status, 0, sizeof(struct spdk_nvme_status));
+ if (size % 4) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid size!\n");
+ return -1;
+ }
+
+	/* Currently only SPDK_NVME_FW_COMMIT_REPLACE_IMG
+	 * and SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG are supported.
+ */
+ if ((commit_action != SPDK_NVME_FW_COMMIT_REPLACE_IMG) &&
+ (commit_action != SPDK_NVME_FW_COMMIT_REPLACE_AND_ENABLE_IMG)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_update_firmware invalid command!\n");
+ return -1;
+ }
+
+ /* Firmware download */
+ size_remaining = size;
+ offset = 0;
+ p = payload;
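+	/*
+	 * A brief walk-through (assuming a 4 KiB min_page_size): a 16 KiB image
+	 * is sent as four sequential Firmware Image Download commands, each
+	 * transferring 4 KiB and advancing the byte offset accordingly.
+	 */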
+
+ while (size_remaining > 0) {
+ transfer = spdk_min(size_remaining, ctrlr->min_page_size);
+
+ res = nvme_ctrlr_cmd_fw_image_download(ctrlr, transfer, offset, p,
+ nvme_completion_poll_cb,
+ &status);
+ if (res) {
+ return res;
+ }
+
+ if (spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock)) {
+ SPDK_ERRLOG("spdk_nvme_ctrlr_fw_image_download failed!\n");
+ return -ENXIO;
+ }
+ p += transfer;
+ offset += transfer;
+ size_remaining -= transfer;
+ }
+
+ /* Firmware commit */
+ memset(&fw_commit, 0, sizeof(struct spdk_nvme_fw_commit));
+ fw_commit.fs = slot;
+ fw_commit.ca = commit_action;
+
+ res = nvme_ctrlr_cmd_fw_commit(ctrlr, &fw_commit, nvme_completion_poll_cb,
+ &status);
+ if (res) {
+ return res;
+ }
+
+ res = spdk_nvme_wait_for_completion_robust_lock(ctrlr->adminq, &status, &ctrlr->ctrlr_lock);
+
+ memcpy(completion_status, &status.cpl.status, sizeof(struct spdk_nvme_status));
+
+ if (res) {
+ if (status.cpl.status.sct != SPDK_NVME_SCT_COMMAND_SPECIFIC ||
+ status.cpl.status.sc != SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET) {
+ if (status.cpl.status.sct == SPDK_NVME_SCT_COMMAND_SPECIFIC &&
+ status.cpl.status.sc == SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET) {
+				SPDK_NOTICELOG("Firmware activation requires a conventional reset to be performed.\n");
+ } else {
+ SPDK_ERRLOG("nvme_ctrlr_cmd_fw_commit failed!\n");
+ }
+ return -ENXIO;
+ }
+ }
+
+ return spdk_nvme_ctrlr_reset(ctrlr);
+}
+
+void *
+spdk_nvme_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
+{
+ void *buf;
+
+ if (size == 0) {
+ return NULL;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ buf = nvme_transport_ctrlr_alloc_cmb_io_buffer(ctrlr, size);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return buf;
+}
+
+void
+spdk_nvme_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
+{
+ if (buf && size) {
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ nvme_transport_ctrlr_free_cmb_io_buffer(ctrlr, buf, size);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ }
+}
diff --git a/src/spdk/lib/nvme/nvme_ctrlr_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c
new file mode 100644
index 00000000..750a2d78
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr_cmd.c
@@ -0,0 +1,694 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+int
+spdk_nvme_ctrlr_cmd_io_raw(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+
+ req = nvme_allocate_request_contig(qpair, buf, len, cb_fn, cb_arg);
+
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ctrlr_cmd_io_raw_with_md(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len, void *md_buf,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buf, md_buf);
+
+ req = nvme_allocate_request(qpair, &payload, len, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ctrlr_cmd_admin_raw(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_cmd *cmd,
+ void *buf, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_contig(ctrlr->adminq, buf, len, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ memcpy(&req->cmd, cmd, sizeof(req->cmd));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr, uint8_t cns, uint16_t cntid, uint32_t nsid,
+ void *payload, size_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size,
+ cb_fn, cb_arg, false);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_IDENTIFY;
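+	/* CDW10 for Identify carries the CNS in bits 7:0 and the CNTID in bits 31:16. */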
+ cmd->cdw10 = cns | ((uint32_t)cntid << 16);
+ cmd->nsid = nsid;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+int
+nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ctrlr_list),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT;
+ cmd->nsid = nsid;
+ cmd->cdw10 = SPDK_NVME_NS_CTRLR_ATTACH;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ctrlr_list),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_ATTACHMENT;
+ cmd->nsid = nsid;
+ cmd->cdw10 = SPDK_NVME_NS_CTRLR_DETACH;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, sizeof(struct spdk_nvme_ns_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT;
+ cmd->cdw10 = SPDK_NVME_NS_MANAGEMENT_CREATE;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_NS_MANAGEMENT;
+ cmd->cdw10 = SPDK_NVME_NS_MANAGEMENT_DELETE;
+ cmd->nsid = nsid;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr, uint64_t prp1, uint64_t prp2,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG;
+ cmd->dptr.prp.prp1 = prp1;
+ cmd->dptr.prp.prp2 = prp2;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, struct spdk_nvme_format *format,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FORMAT_NVM;
+ cmd->nsid = nsid;
+ memcpy(&cmd->cdw10, format, sizeof(uint32_t));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_set_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, uint32_t cdw12, void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+ cmd->cdw12 = cdw12;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_feature(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t ns_id)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+ cmd->nsid = ns_id;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int spdk_nvme_ctrlr_cmd_set_feature_ns(struct spdk_nvme_ctrlr *ctrlr, uint8_t feature,
+ uint32_t cdw11, uint32_t cdw12, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t ns_id)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size, cb_fn, cb_arg,
+ true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SET_FEATURES;
+ cmd->cdw10 = feature;
+ cmd->cdw11 = cdw11;
+ cmd->cdw12 = cdw12;
+ cmd->nsid = ns_id;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t num_queues, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ uint32_t cdw11;
+
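+	/* Number of I/O Submission Queues Requested (bits 15:00) and Number of I/O
+	 * Completion Queues Requested (bits 31:16) are both 0-based values. */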
+ cdw11 = ((num_queues - 1) << 16) | (num_queues - 1);
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, cdw11, 0,
+ NULL, 0, cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ return spdk_nvme_ctrlr_cmd_get_feature(ctrlr, SPDK_NVME_FEAT_NUMBER_OF_QUEUES, 0, NULL, 0,
+ cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr,
+ union spdk_nvme_feat_async_event_configuration config, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ uint32_t cdw11;
+
+ cdw11 = config.raw;
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION, cdw11, 0,
+ NULL, 0,
+ cb_fn, cb_arg);
+}
+
+int
+nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ uint32_t cdw11;
+
+ if (host_id_size == 16) {
+ /* 128-bit extended host identifier */
+ cdw11 = 1;
+ } else if (host_id_size == 8) {
+ /* 64-bit host identifier */
+ cdw11 = 0;
+ } else {
+ SPDK_ERRLOG("Invalid host ID size %u\n", host_id_size);
+ return -EINVAL;
+ }
+
+ return spdk_nvme_ctrlr_cmd_set_feature(ctrlr, SPDK_NVME_FEAT_HOST_IDENTIFIER, cdw11, 0,
+ host_id, host_id_size, cb_fn, cb_arg);
+}
+
+int
+spdk_nvme_ctrlr_cmd_get_log_page(struct spdk_nvme_ctrlr *ctrlr, uint8_t log_page,
+ uint32_t nsid, void *payload, uint32_t payload_size,
+ uint64_t offset, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint32_t numd, numdl, numdu;
+ uint32_t lpol, lpou;
+ int rc;
+
+ if (payload_size == 0) {
+ return -EINVAL;
+ }
+
+ if (offset & 3) {
+ return -EINVAL;
+ }
+
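+	/* NUMD is a 0-based dword count, split across NUMDL (CDW10) and NUMDU (CDW11). */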
+ numd = payload_size / sizeof(uint32_t) - 1u;
+ numdl = numd & 0xFFFFu;
+ numdu = (numd >> 16) & 0xFFFFu;
+
+ lpol = (uint32_t)offset;
+ lpou = (uint32_t)(offset >> 32);
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+
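+	/* A non-zero log page offset requires the controller to support extended data
+	 * for Get Log Page (LPA.EDLP). */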
+ if (offset && !ctrlr->cdata.lpa.edlp) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size, cb_fn, cb_arg, false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_GET_LOG_PAGE;
+ cmd->nsid = nsid;
+ cmd->cdw10 = numdl << 16;
+ cmd->cdw10 |= log_page;
+ cmd->cdw11 = numdu;
+ cmd->cdw12 = lpol;
+ cmd->cdw13 = lpou;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+static void
+spdk_nvme_ctrlr_cmd_abort_cpl(void *ctx, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *req, *next, *tmp;
+ struct spdk_nvme_ctrlr *ctrlr;
+ int rc;
+
+ req = ctx;
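+	/* The ctrlr pointer was stashed in req->user_buffer when the abort was submitted. */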
+ ctrlr = (struct spdk_nvme_ctrlr *)req->user_buffer;
+
+ ctrlr->outstanding_aborts--;
+ STAILQ_FOREACH_SAFE(next, &ctrlr->queued_aborts, stailq, tmp) {
+ STAILQ_REMOVE_HEAD(&ctrlr->queued_aborts, stailq);
+ ctrlr->outstanding_aborts++;
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, next);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to submit queued abort.\n");
+ memset(&next->cpl, 0, sizeof(next->cpl));
+ next->cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+ next->cpl.status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR;
+ next->cpl.status.dnr = 1;
+			nvme_complete_request(next, &next->cpl);
+ nvme_free_request(next);
+ } else {
+ /* If the first abort succeeds, stop iterating. */
+ break;
+ }
+ }
+
+ req->user_cb_fn(req->user_cb_arg, cpl);
+}
+
+int
+spdk_nvme_ctrlr_cmd_abort(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ uint16_t cid, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ int rc;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint16_t sqid;
+
+ if (qpair) {
+ sqid = qpair->id;
+ } else {
+ sqid = ctrlr->adminq->id; /* 0 */
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, spdk_nvme_ctrlr_cmd_abort_cpl, NULL);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+ req->cb_arg = req;
+ req->user_cb_fn = cb_fn;
+ req->user_cb_arg = cb_arg;
+ req->user_buffer = ctrlr; /* This is a hack to get to the ctrlr in the
+ * completion handler. */
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_ABORT;
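+	/* CDW10: SQ identifier in bits 15:00, CID of the command to abort in bits 31:16. */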
+ cmd->cdw10 = (cid << 16) | sqid;
+
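+	/* Queue the abort if the controller's concurrent abort limit has been reached;
+	 * queued aborts are submitted as outstanding ones complete. */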
+ if (ctrlr->outstanding_aborts >= ctrlr->cdata.acl) {
+ STAILQ_INSERT_TAIL(&ctrlr->queued_aborts, req, stailq);
+ rc = 0;
+ } else {
+ ctrlr->outstanding_aborts++;
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ }
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return rc;
+}
+
+int
+nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_fw_commit *fw_commit,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FIRMWARE_COMMIT;
+ memcpy(&cmd->cdw10, fw_commit, sizeof(uint32_t));
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+
+}
+
+int
+nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t size, uint32_t offset, void *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, size, cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD;
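+	/* NUMD (CDW10) is a 0-based dword count and OFST (CDW11) is the dword offset
+	 * of this portion of the firmware image. */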
+ cmd->cdw10 = (size >> 2) - 1;
+ cmd->cdw11 = offset >> 2;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_security_receive(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size,
+ cb_fn, cb_arg, false);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SECURITY_RECEIVE;
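+	/* CDW10: SECP in bits 31:24, SPSP in bits 23:08, NSSF in bits 07:00;
+	 * CDW11 is the allocation length in bytes. */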
+ cmd->cdw10 = ((uint32_t)secp << 24) | ((uint32_t)spsp << 8) | ((uint32_t)nssf);
+ cmd->cdw11 = payload_size;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
+
+int
+spdk_nvme_ctrlr_cmd_security_send(struct spdk_nvme_ctrlr *ctrlr, uint8_t secp,
+ uint16_t spsp, uint8_t nssf, void *payload,
+ uint32_t payload_size, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ int rc;
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq, payload, payload_size,
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_SECURITY_SEND;
+ cmd->cdw10 = ((uint32_t)secp << 24) | ((uint32_t)spsp << 8) | ((uint32_t)nssf);
+ cmd->cdw11 = payload_size;
+
+ rc = nvme_ctrlr_submit_admin_request(ctrlr, req);
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return rc;
+}
diff --git a/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c
new file mode 100644
index 00000000..80de5328
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ctrlr_ocssd_cmd.c
@@ -0,0 +1,83 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvme_ocssd.h"
+#include "nvme_internal.h"
+
+bool
+spdk_nvme_ctrlr_is_ocssd_supported(struct spdk_nvme_ctrlr *ctrlr)
+{
+ if (ctrlr->quirks & NVME_QUIRK_OCSSD) {
+		// TODO: There isn't a standardized way to identify an Open-Channel SSD;
+		// different vendors may have different conditions.
+
+		/*
+		 * The current QEMU Open-Channel device is identified via nsdata->vs[0],
+		 * so check that byte of the first namespace here.
+		 */
+ if (ctrlr->cdata.vid == SPDK_PCI_VID_CNEXLABS) {
+ if (ctrlr->num_ns && ctrlr->nsdata[0].vendor_specific[0] == 0x1) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+int
+spdk_nvme_ocssd_ctrlr_cmd_geometry(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ void *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (!payload || (payload_size != sizeof(struct spdk_ocssd_geometry_data))) {
+ return -EINVAL;
+ }
+
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ req = nvme_allocate_request_user_copy(ctrlr->adminq,
+ payload, payload_size, cb_fn, cb_arg, false);
+	if (req == NULL) {
+		nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+		return -ENOMEM;
+	}
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_GEOMETRY;
+ cmd->nsid = nsid;
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
diff --git a/src/spdk/lib/nvme/nvme_fabric.c b/src/spdk/lib/nvme/nvme_fabric.c
new file mode 100644
index 00000000..4589596a
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_fabric.c
@@ -0,0 +1,340 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over Fabrics transport-independent functions
+ */
+
+#include "nvme_internal.h"
+
+#include "spdk/endian.h"
+#include "spdk/string.h"
+
+static int
+nvme_fabric_prop_set_cmd(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t offset, uint8_t size, uint64_t value)
+{
+ struct spdk_nvmf_fabric_prop_set_cmd cmd = {};
+ struct nvme_completion_poll_status status;
+ int rc;
+
+ assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8);
+
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET;
+ cmd.ofst = offset;
+ cmd.attrib.size = size;
+ cmd.value.u64 = value;
+
+ rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd,
+ NULL, 0,
+ nvme_completion_poll_cb, &status);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("Property Set failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_fabric_prop_get_cmd(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t offset, uint8_t size, uint64_t *value)
+{
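+	/* The Property Set command layout is reused here; its FCTYPE, ATTRIB, and OFST
+	 * fields line up with Property Get, and the VALUE field is simply left unused. */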
+ struct spdk_nvmf_fabric_prop_set_cmd cmd = {};
+ struct nvme_completion_poll_status status;
+ struct spdk_nvmf_fabric_prop_get_rsp *response;
+ int rc;
+
+ assert(size == SPDK_NVMF_PROP_SIZE_4 || size == SPDK_NVMF_PROP_SIZE_8);
+
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET;
+ cmd.ofst = offset;
+ cmd.attrib.size = size;
+
+ rc = spdk_nvme_ctrlr_cmd_admin_raw(ctrlr, (struct spdk_nvme_cmd *)&cmd,
+ NULL, 0, nvme_completion_poll_cb,
+ &status);
+ if (rc < 0) {
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("Property Get failed\n");
+ return -1;
+ }
+
+ response = (struct spdk_nvmf_fabric_prop_get_rsp *)&status.cpl;
+
+ if (size == SPDK_NVMF_PROP_SIZE_4) {
+ *value = response->value.u32.low;
+ } else {
+ *value = response->value.u64;
+ }
+
+ return 0;
+}
+
+int
+nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, value);
+}
+
+int
+nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ return nvme_fabric_prop_set_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value);
+}
+
+int
+nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ uint64_t tmp_value;
+ int rc;
+ rc = nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_4, &tmp_value);
+
+ if (!rc) {
+ *value = (uint32_t)tmp_value;
+ }
+ return rc;
+}
+
+int
+nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ return nvme_fabric_prop_get_cmd(ctrlr, offset, SPDK_NVMF_PROP_SIZE_8, value);
+}
+
+static void
+nvme_fabric_discover_probe(struct spdk_nvmf_discovery_log_page_entry *entry,
+ void *cb_ctx, spdk_nvme_probe_cb probe_cb)
+{
+ struct spdk_nvme_transport_id trid;
+ uint8_t *end;
+ size_t len;
+
+ memset(&trid, 0, sizeof(trid));
+
+ if (entry->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) {
+ SPDK_WARNLOG("Skipping unsupported discovery service referral\n");
+ return;
+ } else if (entry->subtype != SPDK_NVMF_SUBTYPE_NVME) {
+ SPDK_WARNLOG("Skipping unknown subtype %u\n", entry->subtype);
+ return;
+ }
+
+ trid.trtype = entry->trtype;
+ if (!spdk_nvme_transport_available(trid.trtype)) {
+ SPDK_WARNLOG("NVMe transport type %u not available; skipping probe\n",
+ trid.trtype);
+ return;
+ }
+
+ trid.adrfam = entry->adrfam;
+
+ /* Ensure that subnqn is null terminated. */
+ end = memchr(entry->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1);
+ if (!end) {
+ SPDK_ERRLOG("Discovery entry SUBNQN is not null terminated\n");
+ return;
+ }
+ len = end - entry->subnqn;
+ memcpy(trid.subnqn, entry->subnqn, len);
+ trid.subnqn[len] = '\0';
+
+ /* Convert traddr to a null terminated string. */
+ len = spdk_strlen_pad(entry->traddr, sizeof(entry->traddr), ' ');
+ memcpy(trid.traddr, entry->traddr, len);
+ if (spdk_str_chomp(trid.traddr) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRADDR\n");
+ }
+
+ /* Convert trsvcid to a null terminated string. */
+ len = spdk_strlen_pad(entry->trsvcid, sizeof(entry->trsvcid), ' ');
+ memcpy(trid.trsvcid, entry->trsvcid, len);
+ if (spdk_str_chomp(trid.trsvcid) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Trailing newlines removed from discovery TRSVCID\n");
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "subnqn=%s, trtype=%u, traddr=%s, trsvcid=%s\n",
+ trid.subnqn, trid.trtype,
+ trid.traddr, trid.trsvcid);
+
+ nvme_ctrlr_probe(&trid, NULL, probe_cb, cb_ctx);
+}
+
+static int
+nvme_fabric_get_discovery_log_page(struct spdk_nvme_ctrlr *ctrlr,
+ void *log_page, uint32_t size, uint64_t offset)
+{
+ struct nvme_completion_poll_status status;
+ int rc;
+
+ rc = spdk_nvme_ctrlr_cmd_get_log_page(ctrlr, SPDK_NVME_LOG_DISCOVERY, 0, log_page, size, offset,
+ nvme_completion_poll_cb, &status);
+ if (rc < 0) {
+ return -1;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr,
+ void *cb_ctx, spdk_nvme_probe_cb probe_cb)
+{
+ struct spdk_nvmf_discovery_log_page *log_page;
+ struct spdk_nvmf_discovery_log_page_entry *log_page_entry;
+ char buffer[4096];
+ int rc;
+ uint64_t i, numrec, buffer_max_entries_first, buffer_max_entries, log_page_offset = 0;
+ uint64_t remaining_num_rec = 0;
+ uint16_t recfmt;
+
+ memset(buffer, 0x0, 4096);
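+	/* The first read starts at offset 0 and includes the discovery log page header,
+	 * so it holds fewer entries than later reads, which begin on an entry boundary. */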
+ buffer_max_entries_first = (sizeof(buffer) - offsetof(struct spdk_nvmf_discovery_log_page,
+ entries[0])) /
+ sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ buffer_max_entries = sizeof(buffer) / sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ do {
+ rc = nvme_fabric_get_discovery_log_page(ctrlr, buffer, sizeof(buffer), log_page_offset);
+ if (rc < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Get Log Page - Discovery error\n");
+ return rc;
+ }
+
+ if (!remaining_num_rec) {
+ log_page = (struct spdk_nvmf_discovery_log_page *)buffer;
+ recfmt = from_le16(&log_page->recfmt);
+ if (recfmt != 0) {
+ SPDK_ERRLOG("Unrecognized discovery log record format %" PRIu16 "\n", recfmt);
+ return -EPROTO;
+ }
+ remaining_num_rec = log_page->numrec;
+ log_page_offset = offsetof(struct spdk_nvmf_discovery_log_page, entries[0]);
+ log_page_entry = &log_page->entries[0];
+ numrec = spdk_min(remaining_num_rec, buffer_max_entries_first);
+ } else {
+ numrec = spdk_min(remaining_num_rec, buffer_max_entries);
+ log_page_entry = (struct spdk_nvmf_discovery_log_page_entry *)buffer;
+ }
+
+ for (i = 0; i < numrec; i++) {
+ nvme_fabric_discover_probe(log_page_entry++, cb_ctx, probe_cb);
+ }
+ remaining_num_rec -= numrec;
+ log_page_offset += numrec * sizeof(struct spdk_nvmf_discovery_log_page_entry);
+ } while (remaining_num_rec != 0);
+
+ return 0;
+}
+
+int
+nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries)
+{
+ struct nvme_completion_poll_status status;
+ struct spdk_nvmf_fabric_connect_rsp *rsp;
+ struct spdk_nvmf_fabric_connect_cmd cmd;
+ struct spdk_nvmf_fabric_connect_data *nvmf_data;
+ struct spdk_nvme_ctrlr *ctrlr;
+ int rc;
+
+ if (num_entries == 0 || num_entries > SPDK_NVME_IO_QUEUE_MAX_ENTRIES) {
+ return -EINVAL;
+ }
+
+ ctrlr = qpair->ctrlr;
+ if (!ctrlr) {
+ return -EINVAL;
+ }
+
+ nvmf_data = spdk_dma_zmalloc(sizeof(*nvmf_data), 0, NULL);
+ if (!nvmf_data) {
+ SPDK_ERRLOG("nvmf_data allocation error\n");
+ return -ENOMEM;
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.opcode = SPDK_NVME_OPC_FABRIC;
+ cmd.fctype = SPDK_NVMF_FABRIC_COMMAND_CONNECT;
+ cmd.qid = qpair->id;
+ cmd.sqsize = num_entries - 1;
+ cmd.kato = ctrlr->opts.keep_alive_timeout_ms;
+
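+	/* An admin queue connect uses CNTLID 0xFFFF (dynamic controller model);
+	 * I/O queue connects reuse the CNTLID returned by the admin connect. */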
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvmf_data->cntlid = 0xFFFF;
+ } else {
+ nvmf_data->cntlid = ctrlr->cntlid;
+ }
+
+ SPDK_STATIC_ASSERT(sizeof(nvmf_data->hostid) == sizeof(ctrlr->opts.extended_host_id),
+ "host ID size mismatch");
+ memcpy(nvmf_data->hostid, ctrlr->opts.extended_host_id, sizeof(nvmf_data->hostid));
+ snprintf(nvmf_data->hostnqn, sizeof(nvmf_data->hostnqn), "%s", ctrlr->opts.hostnqn);
+ snprintf(nvmf_data->subnqn, sizeof(nvmf_data->subnqn), "%s", ctrlr->trid.subnqn);
+
+ rc = spdk_nvme_ctrlr_cmd_io_raw(ctrlr, qpair,
+ (struct spdk_nvme_cmd *)&cmd,
+ nvmf_data, sizeof(*nvmf_data),
+ nvme_completion_poll_cb, &status);
+ if (rc < 0) {
+ SPDK_ERRLOG("Connect command failed\n");
+ spdk_dma_free(nvmf_data);
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(qpair, &status)) {
+ SPDK_ERRLOG("Connect command failed\n");
+ spdk_dma_free(nvmf_data);
+ return -EIO;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ rsp = (struct spdk_nvmf_fabric_connect_rsp *)&status.cpl;
+ ctrlr->cntlid = rsp->status_code_specific.success.cntlid;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CNTLID 0x%04" PRIx16 "\n", ctrlr->cntlid);
+ }
+
+ spdk_dma_free(nvmf_data);
+ return 0;
+}
diff --git a/src/spdk/lib/nvme/nvme_internal.h b/src/spdk/lib/nvme/nvme_internal.h
new file mode 100644
index 00000000..6e7714a4
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_internal.h
@@ -0,0 +1,1003 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __NVME_INTERNAL_H__
+#define __NVME_INTERNAL_H__
+
+#include "spdk/config.h"
+#include "spdk/likely.h"
+#include "spdk/stdinc.h"
+
+#include "spdk/nvme.h"
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <x86intrin.h>
+#endif
+
+#include "spdk/queue.h"
+#include "spdk/barrier.h"
+#include "spdk/bit_array.h"
+#include "spdk/mmio.h"
+#include "spdk/pci_ids.h"
+#include "spdk/util.h"
+#include "spdk/nvme_intel.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/uuid.h"
+
+#include "spdk_internal/assert.h"
+#include "spdk_internal/log.h"
+
+extern pid_t g_spdk_nvme_pid;
+
+/*
+ * Some Intel devices support vendor-unique read latency log page even
+ * though the log page directory says otherwise.
+ */
+#define NVME_INTEL_QUIRK_READ_LATENCY 0x1
+
+/*
+ * Some Intel devices support vendor-unique write latency log page even
+ * though the log page directory says otherwise.
+ */
+#define NVME_INTEL_QUIRK_WRITE_LATENCY 0x2
+
+/*
+ * The controller needs a delay before the driver starts checking the device
+ * readiness, which is done by reading the NVME_CSTS_RDY bit.
+ */
+#define NVME_QUIRK_DELAY_BEFORE_CHK_RDY 0x4
+
+/*
+ * The controller performs best when I/O is split on particular
+ * LBA boundaries.
+ */
+#define NVME_INTEL_QUIRK_STRIPING 0x8
+
+/*
+ * The controller needs a delay after allocating an I/O queue pair
+ * before it is ready to accept I/O commands.
+ */
+#define NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC 0x10
+
+/*
+ * Earlier NVMe devices do not indicate whether unmapped blocks
+ * will read all zeroes or not. This define indicates that the
+ * device does in fact read all zeroes after an unmap event
+ */
+#define NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE 0x20
+
+/*
+ * The controller doesn't handle Identify CNS values other than 0 or 1 correctly.
+ */
+#define NVME_QUIRK_IDENTIFY_CNS 0x40
+
+/*
+ * The controller supports the Open-Channel command set if an additional
+ * condition matches, e.g. the first byte of the vendor-specific field in
+ * the namespace identify structure is set to 0x1.
+ */
+#define NVME_QUIRK_OCSSD 0x80
+
+/*
+ * The controller has an Intel vendor ID but does not support Intel vendor-specific
+ * log pages. This is primarily for QEMU emulated SSDs which report an Intel vendor
+ * ID but do not support these log pages.
+ */
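+	/* PRP1 points to the shadow doorbell buffer and PRP2 to the EventIdx buffer. */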
+#define NVME_INTEL_QUIRK_NO_LOG_PAGES 0x100
+
+#define NVME_MAX_ASYNC_EVENTS (8)
+
+#define NVME_MIN_TIMEOUT_PERIOD (5)
+#define NVME_MAX_TIMEOUT_PERIOD (120)
+
+/* Maximum log page size to fetch for AERs. */
+#define NVME_MAX_AER_LOG_SIZE (4096)
+
+/*
+ * NVME_MAX_IO_QUEUES in nvme_spec.h defines the 64K spec-limit, but this
+ * define specifies the maximum number of queues this driver will actually
+ * try to configure, if available.
+ */
+#define DEFAULT_MAX_IO_QUEUES (1024)
+#define DEFAULT_IO_QUEUE_SIZE (256)
+
+#define DEFAULT_ADMIN_QUEUE_REQUESTS (32)
+#define DEFAULT_IO_QUEUE_REQUESTS (512)
+
+/* We want to fit submission and completion rings each in a single 2MB
+ * hugepage to ensure physical address contiguity.
+ */
+#define MAX_IO_QUEUE_ENTRIES (0x200000 / spdk_max( \
+ sizeof(struct spdk_nvme_cmd), \
+ sizeof(struct spdk_nvme_cpl)))
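+	/* struct spdk_nvme_format is a packed 32-bit structure, so it maps directly onto CDW10. */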
+
+enum nvme_payload_type {
+ NVME_PAYLOAD_TYPE_INVALID = 0,
+
+ /** nvme_request::u.payload.contig_buffer is valid for this request */
+ NVME_PAYLOAD_TYPE_CONTIG,
+
+ /** nvme_request::u.sgl is valid for this request */
+ NVME_PAYLOAD_TYPE_SGL,
+};
+
+/*
+ * Controller support flags.
+ */
+enum spdk_nvme_ctrlr_flags {
+ SPDK_NVME_CTRLR_SGL_SUPPORTED = 0x1, /**< The SGL is supported */
+};
+
+/**
+ * Descriptor for a request data payload.
+ */
+struct nvme_payload {
+ /**
+ * Functions for retrieving physical addresses for scattered payloads.
+ */
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn;
+
+ /**
+ * If reset_sgl_fn == NULL, this is a contig payload, and contig_or_cb_arg contains the
+ * virtual memory address of a single virtually contiguous buffer.
+ *
+ * If reset_sgl_fn != NULL, this is a SGL payload, and contig_or_cb_arg contains the
+ * cb_arg that will be passed to the SGL callback functions.
+ */
+ void *contig_or_cb_arg;
+
+ /** Virtual memory address of a single virtually contiguous metadata buffer */
+ void *md;
+};
+
+#define NVME_PAYLOAD_CONTIG(contig_, md_) \
+ (struct nvme_payload) { \
+ .reset_sgl_fn = NULL, \
+ .next_sge_fn = NULL, \
+ .contig_or_cb_arg = (contig_), \
+ .md = (md_), \
+ }
+
+#define NVME_PAYLOAD_SGL(reset_sgl_fn_, next_sge_fn_, cb_arg_, md_) \
+ (struct nvme_payload) { \
+ .reset_sgl_fn = (reset_sgl_fn_), \
+ .next_sge_fn = (next_sge_fn_), \
+ .contig_or_cb_arg = (cb_arg_), \
+ .md = (md_), \
+ }
+
+static inline enum nvme_payload_type
+nvme_payload_type(const struct nvme_payload *payload) {
+ return payload->reset_sgl_fn ? NVME_PAYLOAD_TYPE_SGL : NVME_PAYLOAD_TYPE_CONTIG;
+}
+
+struct nvme_error_cmd {
+ bool do_not_submit;
+ uint64_t timeout_tsc;
+ uint32_t err_count;
+ uint8_t opc;
+ struct spdk_nvme_status status;
+ TAILQ_ENTRY(nvme_error_cmd) link;
+};
+
+struct nvme_request {
+ struct spdk_nvme_cmd cmd;
+
+ uint8_t retries;
+
+ bool timed_out;
+
+ /**
+ * Number of children requests still outstanding for this
+ * request which was split into multiple child requests.
+ */
+ uint16_t num_children;
+
+ /**
+ * Offset in bytes from the beginning of payload for this request.
+ * This is used for I/O commands that are split into multiple requests.
+ */
+ uint32_t payload_offset;
+ uint32_t md_offset;
+
+ uint32_t payload_size;
+
+ /**
+	 * Timeout ticks for error injection requests; this can be extended in the
+	 * future to support a per-request timeout feature.
+ */
+ uint64_t timeout_tsc;
+
+ /**
+ * Data payload for this request's command.
+ */
+ struct nvme_payload payload;
+
+ spdk_nvme_cmd_cb cb_fn;
+ void *cb_arg;
+ STAILQ_ENTRY(nvme_request) stailq;
+
+ struct spdk_nvme_qpair *qpair;
+
+ /*
+ * The value of spdk_get_ticks() when the request was submitted to the hardware.
+ * Only set if ctrlr->timeout_enabled is true.
+ */
+ uint64_t submit_tick;
+
+ /**
+ * The active admin request can be moved to a per process pending
+ * list based on the saved pid to tell which process it belongs
+ * to. The cpl saves the original completion information which
+ * is used in the completion callback.
+	 * NOTE: the two fields below are only used for admin requests.
+ */
+ pid_t pid;
+ struct spdk_nvme_cpl cpl;
+
+ /**
+ * The following members should not be reordered with members
+ * above. These members are only needed when splitting
+ * requests which is done rarely, and the driver is careful
+ * to not touch the following fields until a split operation is
+ * needed, to avoid touching an extra cacheline.
+ */
+
+ /**
+ * Points to the outstanding child requests for a parent request.
+ * Only valid if a request was split into multiple children
+ * requests, and is not initialized for non-split requests.
+ */
+ TAILQ_HEAD(, nvme_request) children;
+
+ /**
+ * Linked-list pointers for a child request in its parent's list.
+ */
+ TAILQ_ENTRY(nvme_request) child_tailq;
+
+ /**
+ * Points to a parent request if part of a split request,
+ * NULL otherwise.
+ */
+ struct nvme_request *parent;
+
+ /**
+ * Completion status for a parent request. Initialized to all 0's
+ * (SUCCESS) before child requests are submitted. If a child
+ * request completes with error, the error status is copied here,
+ * to ensure that the parent request is also completed with error
+ * status once all child requests are completed.
+ */
+ struct spdk_nvme_cpl parent_status;
+
+ /**
+ * The user_cb_fn and user_cb_arg fields are used for holding the original
+ * callback data when using nvme_allocate_request_user_copy.
+ */
+ spdk_nvme_cmd_cb user_cb_fn;
+ void *user_cb_arg;
+ void *user_buffer;
+};
+
+struct nvme_completion_poll_status {
+ struct spdk_nvme_cpl cpl;
+ bool done;
+};
+
+struct nvme_async_event_request {
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct nvme_request *req;
+ struct spdk_nvme_cpl cpl;
+};
+
+struct spdk_nvme_qpair {
+ STAILQ_HEAD(, nvme_request) free_req;
+ STAILQ_HEAD(, nvme_request) queued_req;
+	/** Commands with an opcode in this list will return an error */
+ TAILQ_HEAD(, nvme_error_cmd) err_cmd_head;
+	/** Requests in this list will return an error */
+ STAILQ_HEAD(, nvme_request) err_req_head;
+
+ enum spdk_nvme_transport_type trtype;
+
+ uint16_t id;
+
+ uint8_t qprio;
+
+ /*
+ * Members for handling IO qpair deletion inside of a completion context.
+ * These are specifically defined as single bits, so that they do not
+ * push this data structure out to another cacheline.
+ */
+ uint8_t in_completion_context : 1;
+ uint8_t delete_after_completion_context: 1;
+
+ /*
+ * Set when no deletion notification is needed. For example, the process
+ * which allocated this qpair exited unexpectedly.
+ */
+ uint8_t no_deletion_notification_needed: 1;
+
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ /* List entry for spdk_nvme_ctrlr::active_io_qpairs */
+ TAILQ_ENTRY(spdk_nvme_qpair) tailq;
+
+ /* List entry for spdk_nvme_ctrlr_process::allocated_io_qpairs */
+ TAILQ_ENTRY(spdk_nvme_qpair) per_process_tailq;
+
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ void *req_buf;
+};
+
+struct spdk_nvme_ns {
+ struct spdk_nvme_ctrlr *ctrlr;
+ uint32_t sector_size;
+
+ /*
+ * Size of data transferred as part of each block,
+ * including metadata if FLBAS indicates the metadata is transferred
+ * as part of the data buffer at the end of each LBA.
+ */
+ uint32_t extended_lba_size;
+
+ uint32_t md_size;
+ uint32_t pi_type;
+ uint32_t sectors_per_max_io;
+ uint32_t sectors_per_stripe;
+ uint32_t id;
+ uint16_t flags;
+
+ /* Namespace Identification Descriptor List (CNS = 03h) */
+ uint8_t id_desc_list[4096];
+};
+
+/**
+ * State of struct spdk_nvme_ctrlr (in particular, during initialization).
+ */
+enum nvme_ctrlr_state {
+ /**
+ * Wait before initializing the controller.
+ */
+ NVME_CTRLR_STATE_INIT_DELAY,
+
+ /**
+ * Controller has not been initialized yet.
+ */
+ NVME_CTRLR_STATE_INIT,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 0 to 1 so that CC.EN may be set to 0.
+ */
+ NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_1,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 1 to 0 so that CC.EN may be set to 1.
+ */
+ NVME_CTRLR_STATE_DISABLE_WAIT_FOR_READY_0,
+
+ /**
+ * Enable the controller by writing CC.EN to 1
+ */
+ NVME_CTRLR_STATE_ENABLE,
+
+ /**
+ * Waiting for CSTS.RDY to transition from 0 to 1 after enabling the controller.
+ */
+ NVME_CTRLR_STATE_ENABLE_WAIT_FOR_READY_1,
+
+ /**
+ * Enable the Admin queue of the controller.
+ */
+ NVME_CTRLR_STATE_ENABLE_ADMIN_QUEUE,
+
+ /**
+	 * An Identify Controller command will be sent to the controller.
+ */
+ NVME_CTRLR_STATE_IDENTIFY,
+
+ /**
+	 * Waiting for the Identify Controller command to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY,
+
+ /**
+ * Set Number of Queues of the controller.
+ */
+ NVME_CTRLR_STATE_SET_NUM_QUEUES,
+
+ /**
+ * Waiting for Set Num of Queues command to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_SET_NUM_QUEUES,
+
+ /**
+ * Get Number of Queues of the controller.
+ */
+ NVME_CTRLR_STATE_GET_NUM_QUEUES,
+
+ /**
+ * Waiting for Get Num of Queues command to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_GET_NUM_QUEUES,
+
+ /**
+ * Construct Namespace data structures of the controller.
+ */
+ NVME_CTRLR_STATE_CONSTRUCT_NS,
+
+ /**
+ * Get active Namespace list of the controller.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_ACTIVE_NS,
+
+ /**
+ * Get Identify Namespace Data structure for each NS.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_NS,
+
+ /**
+ * Waiting for the Identify Namespace commands to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_NS,
+
+ /**
+ * Get Identify Namespace Identification Descriptors.
+ */
+ NVME_CTRLR_STATE_IDENTIFY_ID_DESCS,
+
+ /**
+ * Waiting for the Identify Namespace Identification
+ * Descriptors to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_IDENTIFY_ID_DESCS,
+
+ /**
+ * Configure AER of the controller.
+ */
+ NVME_CTRLR_STATE_CONFIGURE_AER,
+
+ /**
+ * Waiting for the Configure AER to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_CONFIGURE_AER,
+
+ /**
+ * Set supported log pages of the controller.
+ */
+ NVME_CTRLR_STATE_SET_SUPPORTED_LOG_PAGES,
+
+ /**
+ * Set supported features of the controller.
+ */
+ NVME_CTRLR_STATE_SET_SUPPORTED_FEATURES,
+
+ /**
+ * Set Doorbell Buffer Config of the controller.
+ */
+ NVME_CTRLR_STATE_SET_DB_BUF_CFG,
+
+ /**
+ * Waiting for Doorbell Buffer Config to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_DB_BUF_CFG,
+
+ /**
+ * Set Keep Alive Timeout of the controller.
+ */
+ NVME_CTRLR_STATE_SET_KEEP_ALIVE_TIMEOUT,
+
+ /**
+ * Waiting for Set Keep Alive Timeout to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_KEEP_ALIVE_TIMEOUT,
+
+ /**
+ * Set Host ID of the controller.
+ */
+ NVME_CTRLR_STATE_SET_HOST_ID,
+
+ /**
+ * Waiting for Set Host ID to be completed.
+ */
+ NVME_CTRLR_STATE_WAIT_FOR_HOST_ID,
+
+ /**
+ * Controller initialization has completed and the controller is ready.
+ */
+ NVME_CTRLR_STATE_READY,
+
+ /**
+	 * Controller initialization encountered an error.
+ */
+ NVME_CTRLR_STATE_ERROR
+};
+
+#define NVME_TIMEOUT_INFINITE UINT64_MAX
+
+/*
+ * Used to track properties for all processes accessing the controller.
+ */
+struct spdk_nvme_ctrlr_process {
+ /** Whether it is the primary process */
+ bool is_primary;
+
+ /** Process ID */
+ pid_t pid;
+
+ /** Active admin requests to be completed */
+ STAILQ_HEAD(, nvme_request) active_reqs;
+
+ TAILQ_ENTRY(spdk_nvme_ctrlr_process) tailq;
+
+ /** Per process PCI device handle */
+ struct spdk_pci_device *devhandle;
+
+	/** Reference count to track the number of attachments to this controller. */
+ int ref;
+
+ /** Allocated IO qpairs */
+ TAILQ_HEAD(, spdk_nvme_qpair) allocated_io_qpairs;
+
+ spdk_nvme_aer_cb aer_cb_fn;
+ void *aer_cb_arg;
+
+ /**
+ * A function pointer to timeout callback function
+ */
+ spdk_nvme_timeout_cb timeout_cb_fn;
+ void *timeout_cb_arg;
+ uint64_t timeout_ticks;
+};
+
+/*
+ * One of these per allocated PCI device.
+ */
+struct spdk_nvme_ctrlr {
+ /* Hot data (accessed in I/O path) starts here. */
+
+ /** Array of namespaces indexed by nsid - 1 */
+ struct spdk_nvme_ns *ns;
+
+ struct spdk_nvme_transport_id trid;
+
+ uint32_t num_ns;
+
+ bool is_removed;
+
+ bool is_resetting;
+
+ bool is_failed;
+
+ bool timeout_enabled;
+
+ uint16_t max_sges;
+
+ uint16_t cntlid;
+
+ /** Controller support flags */
+ uint64_t flags;
+
+ /* Cold data (not accessed in normal I/O path) is after this point. */
+
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+
+ enum nvme_ctrlr_state state;
+ uint64_t state_timeout_tsc;
+
+ uint64_t next_keep_alive_tick;
+ uint64_t keep_alive_interval_ticks;
+
+ TAILQ_ENTRY(spdk_nvme_ctrlr) tailq;
+
+ /** All the log pages supported */
+ bool log_page_supported[256];
+
+ /** All the features supported */
+ bool feature_supported[256];
+
+ /** maximum i/o size in bytes */
+ uint32_t max_xfer_size;
+
+ /** minimum page size supported by this controller in bytes */
+ uint32_t min_page_size;
+
+ /** selected memory page size for this controller in bytes */
+ uint32_t page_size;
+
+ uint32_t num_aers;
+ struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS];
+
+ /** guards access to the controller itself, including admin queues */
+ pthread_mutex_t ctrlr_lock;
+
+
+ struct spdk_nvme_qpair *adminq;
+
+ /** shadow doorbell buffer */
+ uint32_t *shadow_doorbell;
+ /** eventidx buffer */
+ uint32_t *eventidx;
+
+ /**
+ * Identify Controller data.
+ */
+ struct spdk_nvme_ctrlr_data cdata;
+
+ /**
+ * Keep track of active namespaces
+ */
+ uint32_t *active_ns_list;
+
+ /**
+ * Array of Identify Namespace data.
+ *
+ * Stored separately from ns since nsdata should not normally be accessed during I/O.
+ */
+ struct spdk_nvme_ns_data *nsdata;
+
+ struct spdk_bit_array *free_io_qids;
+ TAILQ_HEAD(, spdk_nvme_qpair) active_io_qpairs;
+
+ struct spdk_nvme_ctrlr_opts opts;
+
+ uint64_t quirks;
+
+ /* Extra sleep time during controller initialization */
+ uint64_t sleep_timeout_tsc;
+
+	/** Track all the processes that manage this controller */
+ TAILQ_HEAD(, spdk_nvme_ctrlr_process) active_procs;
+
+
+ STAILQ_HEAD(, nvme_request) queued_aborts;
+ uint32_t outstanding_aborts;
+};
+
+struct nvme_driver {
+ pthread_mutex_t lock;
+
+ /** Multi-process shared attached controller list */
+ TAILQ_HEAD(, spdk_nvme_ctrlr) shared_attached_ctrlrs;
+
+ bool initialized;
+ struct spdk_uuid default_extended_host_id;
+};
+
+extern struct nvme_driver *g_spdk_nvme_driver;
+
+int nvme_driver_init(void);
+
+/*
+ * Used for the spdk_nvme_connect() public API to save user specified opts.
+ */
+struct spdk_nvme_ctrlr_connect_opts {
+ const struct spdk_nvme_ctrlr_opts *opts;
+ size_t opts_size;
+};
+
+#define nvme_delay usleep
+
+static inline bool
+nvme_qpair_is_admin_queue(struct spdk_nvme_qpair *qpair)
+{
+ return qpair->id == 0;
+}
+
+static inline bool
+nvme_qpair_is_io_queue(struct spdk_nvme_qpair *qpair)
+{
+ return qpair->id != 0;
+}
+
+static inline int
+nvme_robust_mutex_lock(pthread_mutex_t *mtx)
+{
+ int rc = pthread_mutex_lock(mtx);
+
+#ifndef __FreeBSD__
+ if (rc == EOWNERDEAD) {
+ rc = pthread_mutex_consistent(mtx);
+ }
+#endif
+
+ return rc;
+}
+
+static inline int
+nvme_robust_mutex_unlock(pthread_mutex_t *mtx)
+{
+ return pthread_mutex_unlock(mtx);
+}
+
+/* Admin functions */
+int nvme_ctrlr_cmd_identify(struct spdk_nvme_ctrlr *ctrlr,
+ uint8_t cns, uint16_t cntid, uint32_t nsid,
+ void *payload, size_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t num_queues, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg);
+int nvme_ctrlr_cmd_get_num_queues(struct spdk_nvme_ctrlr *ctrlr,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_async_event_config(struct spdk_nvme_ctrlr *ctrlr,
+ union spdk_nvme_feat_async_event_configuration config,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_set_host_id(struct spdk_nvme_ctrlr *ctrlr, void *host_id, uint32_t host_id_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_attach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_detach_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_ctrlr_list *payload, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_create_ns(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_ns_data *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_doorbell_buffer_config(struct spdk_nvme_ctrlr *ctrlr,
+ uint64_t prp1, uint64_t prp2,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_delete_ns(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg);
+int nvme_ctrlr_cmd_format(struct spdk_nvme_ctrlr *ctrlr, uint32_t nsid,
+ struct spdk_nvme_format *format, spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_fw_commit(struct spdk_nvme_ctrlr *ctrlr,
+ const struct spdk_nvme_fw_commit *fw_commit,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+int nvme_ctrlr_cmd_fw_image_download(struct spdk_nvme_ctrlr *ctrlr,
+ uint32_t size, uint32_t offset, void *payload,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg);
+void nvme_completion_poll_cb(void *arg, const struct spdk_nvme_cpl *cpl);
+int spdk_nvme_wait_for_completion(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status);
+int spdk_nvme_wait_for_completion_robust_lock(struct spdk_nvme_qpair *qpair,
+ struct nvme_completion_poll_status *status,
+ pthread_mutex_t *robust_mutex);
+
+struct spdk_nvme_ctrlr_process *spdk_nvme_ctrlr_get_process(struct spdk_nvme_ctrlr *ctrlr,
+ pid_t pid);
+struct spdk_nvme_ctrlr_process *spdk_nvme_ctrlr_get_current_process(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_ctrlr_add_process(struct spdk_nvme_ctrlr *ctrlr, void *devhandle);
+void nvme_ctrlr_free_processes(struct spdk_nvme_ctrlr *ctrlr);
+struct spdk_pci_device *nvme_ctrlr_proc_get_devhandle(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_ctrlr_probe(const struct spdk_nvme_transport_id *trid, void *devhandle,
+ spdk_nvme_probe_cb probe_cb, void *cb_ctx);
+
+int nvme_ctrlr_construct(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_destruct_finish(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_fail(struct spdk_nvme_ctrlr *ctrlr, bool hot_remove);
+int nvme_ctrlr_process_init(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_connected(struct spdk_nvme_ctrlr *ctrlr);
+
+int nvme_ctrlr_submit_admin_request(struct spdk_nvme_ctrlr *ctrlr,
+ struct nvme_request *req);
+int nvme_ctrlr_get_cap(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_cap_register *cap);
+int nvme_ctrlr_get_vs(struct spdk_nvme_ctrlr *ctrlr, union spdk_nvme_vs_register *vs);
+void nvme_ctrlr_init_cap(struct spdk_nvme_ctrlr *ctrlr, const union spdk_nvme_cap_register *cap,
+ const union spdk_nvme_vs_register *vs);
+int nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
+ struct spdk_nvme_ctrlr *ctrlr,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests);
+void nvme_qpair_deinit(struct spdk_nvme_qpair *qpair);
+void nvme_qpair_enable(struct spdk_nvme_qpair *qpair);
+void nvme_qpair_disable(struct spdk_nvme_qpair *qpair);
+int nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req);
+
+int nvme_ctrlr_identify_active_ns(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ns_set_identify_data(struct spdk_nvme_ns *ns);
+int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id,
+ struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ns_destruct(struct spdk_nvme_ns *ns);
+
+int nvme_fabric_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value);
+int nvme_fabric_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value);
+int nvme_fabric_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value);
+int nvme_fabric_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value);
+int nvme_fabric_ctrlr_discover(struct spdk_nvme_ctrlr *ctrlr, void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb);
+int nvme_fabric_qpair_connect(struct spdk_nvme_qpair *qpair, uint32_t num_entries);
+
+static inline struct nvme_request *
+nvme_allocate_request(struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+
+ req = STAILQ_FIRST(&qpair->free_req);
+ if (req == NULL) {
+ return req;
+ }
+
+ STAILQ_REMOVE_HEAD(&qpair->free_req, stailq);
+
+ /*
+ * Only memset/zero fields that need it. All other fields
+ * will be initialized appropriately either later in this
+ * function, or before they are needed later in the
+	 * submission path. For example, the children
+ * TAILQ_ENTRY and following members are
+ * only used as part of I/O splitting so we avoid
+ * memsetting them until it is actually needed.
+ * They will be initialized in nvme_request_add_child()
+ * if the request is split.
+ */
+ memset(req, 0, offsetof(struct nvme_request, payload_size));
+
+ req->cb_fn = cb_fn;
+ req->cb_arg = cb_arg;
+ req->payload = *payload;
+ req->payload_size = payload_size;
+ req->qpair = qpair;
+ req->pid = g_spdk_nvme_pid;
+
+ return req;
+}
+
+static inline struct nvme_request *
+nvme_allocate_request_contig(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ return nvme_allocate_request(qpair, &payload, payload_size, cb_fn, cb_arg);
+}
+
+static inline struct nvme_request *
+nvme_allocate_request_null(struct spdk_nvme_qpair *qpair, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ return nvme_allocate_request_contig(qpair, NULL, 0, cb_fn, cb_arg);
+}
+
+struct nvme_request *nvme_allocate_request_user_copy(struct spdk_nvme_qpair *qpair,
+ void *buffer, uint32_t payload_size,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, bool host_to_controller);
+
+static inline void
+nvme_complete_request(struct nvme_request *req, struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_qpair *qpair = req->qpair;
+ struct spdk_nvme_cpl err_cpl;
+ struct nvme_error_cmd *cmd;
+
+ /* error injection at completion path,
+ * only inject for successful completed commands
+ */
+ if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head) &&
+ !spdk_nvme_cpl_is_error(cpl))) {
+ TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) {
+
+ if (cmd->do_not_submit) {
+ continue;
+ }
+
+ if ((cmd->opc == req->cmd.opc) && cmd->err_count) {
+
+ err_cpl = *cpl;
+ err_cpl.status.sct = cmd->status.sct;
+ err_cpl.status.sc = cmd->status.sc;
+
+ cpl = &err_cpl;
+ cmd->err_count--;
+ break;
+ }
+ }
+ }
+
+ if (req->cb_fn) {
+ req->cb_fn(req->cb_arg, cpl);
+ }
+}
+
+static inline void
+nvme_free_request(struct nvme_request *req)
+{
+ assert(req != NULL);
+ assert(req->num_children == 0);
+ assert(req->qpair != NULL);
+
+ STAILQ_INSERT_HEAD(&req->qpair->free_req, req, stailq);
+}
+
+void nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child);
+int nvme_request_check_timeout(struct nvme_request *req, uint16_t cid,
+ struct spdk_nvme_ctrlr_process *active_proc, uint64_t now_tick);
+uint64_t nvme_get_quirks(const struct spdk_pci_id *id);
+
+int nvme_robust_mutex_init_shared(pthread_mutex_t *mtx);
+int nvme_robust_mutex_init_recursive_shared(pthread_mutex_t *mtx);
+
+bool nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl);
+void nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd);
+void nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cpl *cpl);
+
+struct spdk_nvme_ctrlr *spdk_nvme_get_ctrlr_by_trid_unsafe(
+ const struct spdk_nvme_transport_id *trid);
+
+/* Transport specific functions */
+#define DECLARE_TRANSPORT(name) \
+ struct spdk_nvme_ctrlr *nvme_ ## name ## _ctrlr_construct(const struct spdk_nvme_transport_id *trid, const struct spdk_nvme_ctrlr_opts *opts, \
+ void *devhandle); \
+ int nvme_ ## name ## _ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr); \
+ int nvme_ ## name ## _ctrlr_scan(const struct spdk_nvme_transport_id *trid, void *cb_ctx, spdk_nvme_probe_cb probe_cb, spdk_nvme_remove_cb remove_cb, bool direct_connect); \
+ int nvme_ ## name ## _ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr); \
+ int nvme_ ## name ## _ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value); \
+ int nvme_ ## name ## _ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value); \
+ int nvme_ ## name ## _ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value); \
+ int nvme_ ## name ## _ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value); \
+ uint32_t nvme_ ## name ## _ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr); \
+ uint16_t nvme_ ## name ## _ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr); \
+ struct spdk_nvme_qpair *nvme_ ## name ## _ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid, const struct spdk_nvme_io_qpair_opts *opts); \
+ void *nvme_ ## name ## _ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size); \
+ int nvme_ ## name ## _ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size); \
+ int nvme_ ## name ## _ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _qpair_enable(struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _qpair_disable(struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _qpair_reset(struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _qpair_fail(struct spdk_nvme_qpair *qpair); \
+ int nvme_ ## name ## _qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req); \
+ int32_t nvme_ ## name ## _qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions);
+
+DECLARE_TRANSPORT(transport) /* generic transport dispatch functions */
+DECLARE_TRANSPORT(pcie)
+#ifdef SPDK_CONFIG_RDMA
+DECLARE_TRANSPORT(rdma)
+#endif
+
+#undef DECLARE_TRANSPORT
+
+/*
+ * The ref-related functions below must be called with the global
+ * driver lock held to cover the multi-process case.
+ * Within these functions, the per-controller ctrlr_lock is also
+ * acquired to cover the multi-thread case.
+ */
+void nvme_ctrlr_proc_get_ref(struct spdk_nvme_ctrlr *ctrlr);
+void nvme_ctrlr_proc_put_ref(struct spdk_nvme_ctrlr *ctrlr);
+int nvme_ctrlr_get_ref_count(struct spdk_nvme_ctrlr *ctrlr);
+
+static inline bool
+_is_page_aligned(uint64_t address, uint64_t page_size)
+{
+ return (address & (page_size - 1)) == 0;
+}
+
+#endif /* __NVME_INTERNAL_H__ */
diff --git a/src/spdk/lib/nvme/nvme_ns.c b/src/spdk/lib/nvme/nvme_ns.c
new file mode 100644
index 00000000..b88bf174
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns.c
@@ -0,0 +1,360 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static inline struct spdk_nvme_ns_data *
+_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+ return &ns->ctrlr->nsdata[ns->id - 1];
+}
+
+/**
+ * Update Namespace flags based on Identify Controller
+ * and Identify Namespace. This can also be used for
+ * Namespace Attribute Notice events and Namespace
+ * operations such as Attach/Detach.
+ */
+void
+nvme_ns_set_identify_data(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ns_data *nsdata;
+
+ nsdata = _nvme_ns_get_data(ns);
+
+ ns->flags = 0x0000;
+
+ ns->sector_size = 1 << nsdata->lbaf[nsdata->flbas.format].lbads;
+ ns->extended_lba_size = ns->sector_size;
+
+ ns->md_size = nsdata->lbaf[nsdata->flbas.format].ms;
+ if (nsdata->flbas.extended) {
+ ns->flags |= SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED;
+ ns->extended_lba_size += ns->md_size;
+ }
+
+ ns->sectors_per_max_io = spdk_nvme_ns_get_max_io_xfer_size(ns) / ns->extended_lba_size;
+
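+ /*
+ * Prefer the Optimal I/O Boundary (NOIOB) reported in Identify Namespace;
+ * otherwise fall back to the Intel striping quirk, where (judging from the
+ * code below) cdata.vs[3] appears to hold log2 of the stripe size in
+ * minimum-page-size units.
+ */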
+ if (nsdata->noiob) {
+ ns->sectors_per_stripe = nsdata->noiob;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u optimal IO boundary %" PRIu32 " blocks\n",
+ ns->id, ns->sectors_per_stripe);
+ } else if (ns->ctrlr->quirks & NVME_INTEL_QUIRK_STRIPING &&
+ ns->ctrlr->cdata.vs[3] != 0) {
+ ns->sectors_per_stripe = (1ULL << ns->ctrlr->cdata.vs[3]) * ns->ctrlr->min_page_size /
+ ns->sector_size;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "ns %u stripe size quirk %" PRIu32 " blocks\n",
+ ns->id, ns->sectors_per_stripe);
+ } else {
+ ns->sectors_per_stripe = 0;
+ }
+
+ if (ns->ctrlr->cdata.oncs.dsm) {
+ ns->flags |= SPDK_NVME_NS_DEALLOCATE_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.vwc.present) {
+ ns->flags |= SPDK_NVME_NS_FLUSH_SUPPORTED;
+ }
+
+ if (ns->ctrlr->cdata.oncs.write_zeroes) {
+ ns->flags |= SPDK_NVME_NS_WRITE_ZEROES_SUPPORTED;
+ }
+
+ if (nsdata->nsrescap.raw) {
+ ns->flags |= SPDK_NVME_NS_RESERVATION_SUPPORTED;
+ }
+
+ ns->pi_type = SPDK_NVME_FMT_NVM_PROTECTION_DISABLE;
+ if (nsdata->lbaf[nsdata->flbas.format].ms && nsdata->dps.pit) {
+ ns->flags |= SPDK_NVME_NS_DPS_PI_SUPPORTED;
+ ns->pi_type = nsdata->dps.pit;
+ }
+}
+
+static int
+nvme_ctrlr_identify_ns(struct spdk_nvme_ns *ns)
+{
+ struct nvme_completion_poll_status status;
+ struct spdk_nvme_ns_data *nsdata;
+ int rc;
+
+ nsdata = _nvme_ns_get_data(ns);
+ rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS, 0, ns->id,
+ nsdata, sizeof(*nsdata),
+ nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, &status,
+ &ns->ctrlr->ctrlr_lock)) {
+ /* This can occur if the namespace is not active. Simply zero the
+ * namespace data and continue. */
+ nvme_ns_destruct(ns);
+ return 0;
+ }
+
+ nvme_ns_set_identify_data(ns);
+
+ return 0;
+}
+
+static int
+nvme_ctrlr_identify_id_desc(struct spdk_nvme_ns *ns)
+{
+ struct nvme_completion_poll_status status;
+ int rc;
+
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+
+ if (ns->ctrlr->vs.raw < SPDK_NVME_VERSION(1, 3, 0) ||
+ (ns->ctrlr->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Version < 1.3; not attempting to retrieve NS ID Descriptor List\n");
+ return 0;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Attempting to retrieve NS ID Descriptor List\n");
+ rc = nvme_ctrlr_cmd_identify(ns->ctrlr, SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST, 0, ns->id,
+ ns->id_desc_list, sizeof(ns->id_desc_list),
+ nvme_completion_poll_cb, &status);
+ if (rc < 0) {
+ return rc;
+ }
+
+ rc = spdk_nvme_wait_for_completion_robust_lock(ns->ctrlr->adminq, &status, &ns->ctrlr->ctrlr_lock);
+ if (rc != 0) {
+ SPDK_WARNLOG("Failed to retrieve NS ID Descriptor List\n");
+ memset(ns->id_desc_list, 0, sizeof(ns->id_desc_list));
+ }
+
+ return rc;
+}
+
+uint32_t
+spdk_nvme_ns_get_id(struct spdk_nvme_ns *ns)
+{
+ return ns->id;
+}
+
+bool
+spdk_nvme_ns_is_active(struct spdk_nvme_ns *ns)
+{
+ const struct spdk_nvme_ns_data *nsdata = NULL;
+
+ /*
+ * According to the spec, a valid NS has a non-zero ID.
+ */
+ if (ns->id == 0) {
+ return false;
+ }
+
+ nsdata = _nvme_ns_get_data(ns);
+
+ /*
+ * According to the spec, Identify Namespace will return a zero-filled structure for
+ * inactive namespace IDs.
+ * Check NCAP since it must be nonzero for an active namespace.
+ */
+ return nsdata->ncap != 0;
+}
+
+struct spdk_nvme_ctrlr *
+spdk_nvme_ns_get_ctrlr(struct spdk_nvme_ns *ns)
+{
+ return ns->ctrlr;
+}
+
+uint32_t
+spdk_nvme_ns_get_max_io_xfer_size(struct spdk_nvme_ns *ns)
+{
+ return ns->ctrlr->max_xfer_size;
+}
+
+uint32_t
+spdk_nvme_ns_get_sector_size(struct spdk_nvme_ns *ns)
+{
+ return ns->sector_size;
+}
+
+uint32_t
+spdk_nvme_ns_get_extended_sector_size(struct spdk_nvme_ns *ns)
+{
+ return ns->extended_lba_size;
+}
+
+uint64_t
+spdk_nvme_ns_get_num_sectors(struct spdk_nvme_ns *ns)
+{
+ return _nvme_ns_get_data(ns)->nsze;
+}
+
+uint64_t
+spdk_nvme_ns_get_size(struct spdk_nvme_ns *ns)
+{
+ return spdk_nvme_ns_get_num_sectors(ns) * spdk_nvme_ns_get_sector_size(ns);
+}
+
+uint32_t
+spdk_nvme_ns_get_flags(struct spdk_nvme_ns *ns)
+{
+ return ns->flags;
+}
+
+enum spdk_nvme_pi_type
+spdk_nvme_ns_get_pi_type(struct spdk_nvme_ns *ns) {
+ return ns->pi_type;
+}
+
+bool
+spdk_nvme_ns_supports_extended_lba(struct spdk_nvme_ns *ns)
+{
+ return (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) ? true : false;
+}
+
+uint32_t
+spdk_nvme_ns_get_md_size(struct spdk_nvme_ns *ns)
+{
+ return ns->md_size;
+}
+
+const struct spdk_nvme_ns_data *
+spdk_nvme_ns_get_data(struct spdk_nvme_ns *ns)
+{
+ return _nvme_ns_get_data(ns);
+}
+
+enum spdk_nvme_dealloc_logical_block_read_value spdk_nvme_ns_get_dealloc_logical_block_read_value(
+ struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ctrlr *ctrlr = ns->ctrlr;
+ const struct spdk_nvme_ns_data *data = spdk_nvme_ns_get_data(ns);
+
+ if (ctrlr->quirks & NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE) {
+ return SPDK_NVME_DEALLOC_READ_00;
+ } else {
+ return data->dlfeat.bits.read_value;
+ }
+}
+
+uint32_t
+spdk_nvme_ns_get_optimal_io_boundary(struct spdk_nvme_ns *ns)
+{
+ return ns->sectors_per_stripe;
+}
+
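+/*
+ * Walk the Namespace Identification Descriptor list returned by Identify
+ * CNS 03h. Each entry starts with a 4-byte header (NIDT, NIDL, two
+ * reserved bytes) followed by NIDL bytes of identifier data; a NIDL of
+ * zero terminates the list.
+ */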
+static const void *
+_spdk_nvme_ns_find_id_desc(const struct spdk_nvme_ns *ns, enum spdk_nvme_nidt type, size_t *length)
+{
+ const struct spdk_nvme_ns_id_desc *desc;
+ size_t offset;
+
+ offset = 0;
+ while (offset + 4 < sizeof(ns->id_desc_list)) {
+ desc = (const struct spdk_nvme_ns_id_desc *)&ns->id_desc_list[offset];
+
+ if (desc->nidl == 0) {
+ /* End of list */
+ return NULL;
+ }
+
+ /*
+ * Check if this descriptor fits within the list.
+ * The 4-byte fixed descriptor header is not counted in NIDL.
+ */
+ if (offset + desc->nidl + 4 > sizeof(ns->id_desc_list)) {
+ /* Descriptor longer than remaining space in list (invalid) */
+ return NULL;
+ }
+
+ if (desc->nidt == type) {
+ *length = desc->nidl;
+ return &desc->nid[0];
+ }
+
+ offset += 4 + desc->nidl;
+ }
+
+ return NULL;
+}
+
+const struct spdk_uuid *
+spdk_nvme_ns_get_uuid(const struct spdk_nvme_ns *ns)
+{
+ const struct spdk_uuid *uuid;
+ size_t uuid_size;
+
+ uuid = _spdk_nvme_ns_find_id_desc(ns, SPDK_NVME_NIDT_UUID, &uuid_size);
+ if (uuid == NULL || uuid_size != sizeof(*uuid)) {
+ return NULL;
+ }
+
+ return uuid;
+}
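+
+/*
+ * Illustrative usage sketch (not part of the original file), assuming the
+ * spdk_uuid_fmt_lower() helper from spdk/uuid.h is available:
+ *
+ *   char str[SPDK_UUID_STRING_LEN];
+ *   const struct spdk_uuid *uuid = spdk_nvme_ns_get_uuid(ns);
+ *
+ *   if (uuid != NULL && spdk_uuid_fmt_lower(str, sizeof(str), uuid) == 0) {
+ *       printf("Namespace UUID: %s\n", str);
+ *   }
+ */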
+
+int nvme_ns_construct(struct spdk_nvme_ns *ns, uint32_t id,
+ struct spdk_nvme_ctrlr *ctrlr)
+{
+ int rc;
+
+ assert(id > 0);
+
+ ns->ctrlr = ctrlr;
+ ns->id = id;
+
+ rc = nvme_ctrlr_identify_ns(ns);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return nvme_ctrlr_identify_id_desc(ns);
+}
+
+void nvme_ns_destruct(struct spdk_nvme_ns *ns)
+{
+ struct spdk_nvme_ns_data *nsdata;
+
+ if (!ns->id) {
+ return;
+ }
+
+ nsdata = _nvme_ns_get_data(ns);
+ memset(nsdata, 0, sizeof(*nsdata));
+ ns->sector_size = 0;
+ ns->extended_lba_size = 0;
+ ns->md_size = 0;
+ ns->pi_type = 0;
+ ns->sectors_per_max_io = 0;
+ ns->sectors_per_stripe = 0;
+ ns->flags = 0;
+}
diff --git a/src/spdk/lib/nvme/nvme_ns_cmd.c b/src/spdk/lib/nvme/nvme_ns_cmd.c
new file mode 100644
index 00000000..9562cf5a
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns_cmd.c
@@ -0,0 +1,1026 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+static struct nvme_request *_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg, uint32_t opc, uint32_t io_flags,
+ uint16_t apptag_mask, uint16_t apptag, bool check_sgl);
+
+
+static bool
+spdk_nvme_ns_check_request_length(uint32_t lba_count, uint32_t sectors_per_max_io,
+ uint32_t sectors_per_stripe, uint32_t qdepth)
+{
+ uint32_t child_per_io;
+
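+ /*
+ * Worked example (hypothetical values): splitting produces roughly
+ * ceil(lba_count / sectors_per_max_io) children, so lba_count = 2048 with
+ * sectors_per_max_io = 256 needs 8 child requests. If that count meets or
+ * exceeds the queue depth, the request can never be satisfied, and the
+ * callers below return -EINVAL instead of -ENOMEM.
+ */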
+ if (sectors_per_stripe > 0) {
+ child_per_io = (lba_count + sectors_per_stripe - 1) / sectors_per_stripe;
+ } else {
+ child_per_io = (lba_count + sectors_per_max_io - 1) / sectors_per_max_io;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "checking maximum i/o length %d\n", child_per_io);
+
+ return child_per_io >= qdepth;
+}
+
+static void
+nvme_cb_complete_child(void *child_arg, const struct spdk_nvme_cpl *cpl)
+{
+ struct nvme_request *child = child_arg;
+ struct nvme_request *parent = child->parent;
+
+ nvme_request_remove_child(parent, child);
+
+ if (spdk_nvme_cpl_is_error(cpl)) {
+ memcpy(&parent->parent_status, cpl, sizeof(*cpl));
+ }
+
+ if (parent->num_children == 0) {
+ nvme_complete_request(parent, &parent->parent_status);
+ nvme_free_request(parent);
+ }
+}
+
+static void
+nvme_request_add_child(struct nvme_request *parent, struct nvme_request *child)
+{
+ assert(parent->num_children != UINT16_MAX);
+
+ if (parent->num_children == 0) {
+ /*
+ * Defer initialization of the children TAILQ since it falls
+ * on a separate cacheline. This ensures we do not touch this
+ * cacheline except in request-splitting cases, which are
+ * relatively rare.
+ */
+ TAILQ_INIT(&parent->children);
+ parent->parent = NULL;
+ memset(&parent->parent_status, 0, sizeof(struct spdk_nvme_cpl));
+ }
+
+ parent->num_children++;
+ TAILQ_INSERT_TAIL(&parent->children, child, child_tailq);
+ child->parent = parent;
+ child->cb_fn = nvme_cb_complete_child;
+ child->cb_arg = child;
+}
+
+void
+nvme_request_remove_child(struct nvme_request *parent, struct nvme_request *child)
+{
+ assert(parent != NULL);
+ assert(child != NULL);
+ assert(child->parent == parent);
+ assert(parent->num_children != 0);
+
+ parent->num_children--;
+ TAILQ_REMOVE(&parent->children, child, child_tailq);
+}
+
+static void
+nvme_request_free_children(struct nvme_request *req)
+{
+ struct nvme_request *child, *tmp;
+
+ if (req->num_children == 0) {
+ return;
+ }
+
+ /* Free all child nvme_request objects (recursively). */
+ TAILQ_FOREACH_SAFE(child, &req->children, child_tailq, tmp) {
+ nvme_request_remove_child(req, child);
+ nvme_request_free_children(child);
+ nvme_free_request(child);
+ }
+}
+
+static struct nvme_request *
+_nvme_add_child_request(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag,
+ struct nvme_request *parent, bool check_sgl)
+{
+ struct nvme_request *child;
+
+ child = _nvme_ns_cmd_rw(ns, qpair, payload, payload_offset, md_offset, lba, lba_count, cb_fn,
+ cb_arg, opc, io_flags, apptag_mask, apptag, check_sgl);
+ if (child == NULL) {
+ nvme_request_free_children(parent);
+ nvme_free_request(parent);
+ return NULL;
+ }
+
+ nvme_request_add_child(parent, child);
+ return child;
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint32_t sectors_per_max_io, uint32_t sector_mask,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ uint32_t sector_size;
+ uint32_t md_size = ns->md_size;
+ uint32_t remaining_lba_count = lba_count;
+ struct nvme_request *child;
+
+ sector_size = ns->extended_lba_size;
+
+ if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
+ (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
+ (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
+ (md_size == 8)) {
+ sector_size -= 8;
+ }
+
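+ /*
+ * Worked example (hypothetical values): with sectors_per_max_io = 128,
+ * sector_mask = 127, lba = 100 and lba_count = 300, the loop below emits
+ * children of 28, 128, 128 and 16 blocks, so every child after the first
+ * starts on a 128-block boundary.
+ */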
+ while (remaining_lba_count > 0) {
+ lba_count = sectors_per_max_io - (lba & sector_mask);
+ lba_count = spdk_min(remaining_lba_count, lba_count);
+
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc,
+ io_flags, apptag_mask, apptag, req, true);
+ if (child == NULL) {
+ return NULL;
+ }
+
+ remaining_lba_count -= lba_count;
+ lba += lba_count;
+ payload_offset += lba_count * sector_size;
+ md_offset += lba_count * md_size;
+ }
+
+ return req;
+}
+
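+/*
+ * Fill in the NVM command dwords for a read/write/compare: CDW10-11 hold
+ * the starting LBA, CDW12 holds the 0-based block count plus the I/O
+ * flags, CDW14 holds the initial reference tag for protection types 1
+ * and 2, and CDW15 holds the application tag mask and application tag.
+ */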
+static void
+_nvme_ns_cmd_setup_request(struct spdk_nvme_ns *ns, struct nvme_request *req,
+ uint32_t opc, uint64_t lba, uint32_t lba_count,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct spdk_nvme_cmd *cmd;
+
+ cmd = &req->cmd;
+ cmd->opc = opc;
+ cmd->nsid = ns->id;
+
+ *(uint64_t *)&cmd->cdw10 = lba;
+
+ if (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) {
+ switch (ns->pi_type) {
+ case SPDK_NVME_FMT_NVM_PROTECTION_TYPE1:
+ case SPDK_NVME_FMT_NVM_PROTECTION_TYPE2:
+ cmd->cdw14 = (uint32_t)lba;
+ break;
+ }
+ }
+
+ cmd->cdw12 = lba_count - 1;
+ cmd->cdw12 |= io_flags;
+
+ cmd->cdw15 = apptag_mask;
+ cmd->cdw15 = (cmd->cdw15 << 16 | apptag);
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request_prp(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn;
+ void *sgl_cb_arg = req->payload.contig_or_cb_arg;
+ bool start_valid, end_valid, last_sge, child_equals_parent;
+ uint64_t child_lba = lba;
+ uint32_t req_current_length = 0;
+ uint32_t child_length = 0;
+ uint32_t sge_length;
+ uint32_t page_size = qpair->ctrlr->page_size;
+ uintptr_t address;
+
+ reset_sgl_fn(sgl_cb_arg, payload_offset);
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+ while (req_current_length < req->payload_size) {
+
+ if (sge_length == 0) {
+ continue;
+ } else if (req_current_length + sge_length > req->payload_size) {
+ sge_length = req->payload_size - req_current_length;
+ }
+
+ /*
+ * The start of the SGE is invalid if the start address is not page aligned,
+ * unless it is the first SGE in the child request.
+ */
+ start_valid = child_length == 0 || _is_page_aligned(address, page_size);
+
+ /* Boolean for whether this is the last SGE in the parent request. */
+ last_sge = (req_current_length + sge_length == req->payload_size);
+
+ /*
+ * The end of the SGE is invalid if the end address is not page aligned,
+ * unless it is the last SGE in the parent request.
+ */
+ end_valid = last_sge || _is_page_aligned(address + sge_length, page_size);
+
+ /*
+ * True when the accumulated child request would equal the parent request,
+ * meaning that no splitting is required for the parent request (the one
+ * passed into this function).
+ * In this case, we do not create a child request at all - we just send
+ * the original request as a single request at the end of this function.
+ */
+ child_equals_parent = (child_length + sge_length == req->payload_size);
+
+ if (start_valid) {
+ /*
+ * The start of the SGE is valid, so advance the length parameters,
+ * to include this SGE with previous SGEs for this child request
+ * (if any). If it is not valid, we do not advance the length
+ * parameters nor get the next SGE, because we must send what has
+ * been collected before this SGE as a child request.
+ */
+ child_length += sge_length;
+ req_current_length += sge_length;
+ if (req_current_length < req->payload_size) {
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+ }
+ /*
+ * If the next SGE is not page aligned, we will need to create a child
+ * request for what we have so far, and then start a new child request for
+ * the next SGE.
+ */
+ start_valid = _is_page_aligned(address, page_size);
+ }
+
+ if (start_valid && end_valid && !last_sge) {
+ continue;
+ }
+
+ /*
+ * We need to create a split here. Send what we have accumulated so far as a child
+ * request. Checking if child_equals_parent allows us to *not* create a child request
+ * when no splitting is required - in that case we will fall-through and just create
+ * a single request with no children for the entire I/O.
+ */
+ if (!child_equals_parent) {
+ struct nvme_request *child;
+ uint32_t child_lba_count;
+
+ if ((child_length % ns->extended_lba_size) != 0) {
+ SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n",
+ child_length, ns->extended_lba_size);
+ return NULL;
+ }
+ child_lba_count = child_length / ns->extended_lba_size;
+ /*
+ * Note the last parameter is set to "false" - this tells the recursive
+ * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting
+ * since we have already verified it here.
+ */
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ child_lba, child_lba_count,
+ cb_fn, cb_arg, opc, io_flags,
+ apptag_mask, apptag, req, false);
+ if (child == NULL) {
+ return NULL;
+ }
+ payload_offset += child_length;
+ md_offset += child_lba_count * ns->md_size;
+ child_lba += child_lba_count;
+ child_length = 0;
+ }
+ }
+
+ if (child_length == req->payload_size) {
+ /* No splitting was required, so setup the whole payload as one request. */
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ }
+
+ return req;
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_split_request_sgl(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload,
+ uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, struct nvme_request *req,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn = req->payload.reset_sgl_fn;
+ spdk_nvme_req_next_sge_cb next_sge_fn = req->payload.next_sge_fn;
+ void *sgl_cb_arg = req->payload.contig_or_cb_arg;
+ uint64_t child_lba = lba;
+ uint32_t req_current_length = 0;
+ uint32_t child_length = 0;
+ uint32_t sge_length;
+ uint16_t max_sges, num_sges;
+ uintptr_t address;
+
+ max_sges = ns->ctrlr->max_sges;
+
+ reset_sgl_fn(sgl_cb_arg, payload_offset);
+ num_sges = 0;
+
+ while (req_current_length < req->payload_size) {
+ next_sge_fn(sgl_cb_arg, (void **)&address, &sge_length);
+
+ if (req_current_length + sge_length > req->payload_size) {
+ sge_length = req->payload_size - req_current_length;
+ }
+
+ child_length += sge_length;
+ req_current_length += sge_length;
+ num_sges++;
+
+ if (num_sges < max_sges) {
+ continue;
+ }
+
+ /*
+ * We need to create a split here. Send what we have accumulated so far as a child
+ * request. Checking if the child equals the full payload allows us to *not*
+ * create a child request when no splitting is required - in that case we will
+ * fall-through and just create a single request with no children for the entire I/O.
+ */
+ if (child_length != req->payload_size) {
+ struct nvme_request *child;
+ uint32_t child_lba_count;
+
+ if ((child_length % ns->extended_lba_size) != 0) {
+ SPDK_ERRLOG("child_length %u not even multiple of lba_size %u\n",
+ child_length, ns->extended_lba_size);
+ return NULL;
+ }
+ child_lba_count = child_length / ns->extended_lba_size;
+ /*
+ * Note the last parameter is set to "false" - this tells the recursive
+ * call to _nvme_ns_cmd_rw() to not bother with checking for SGL splitting
+ * since we have already verified it here.
+ */
+ child = _nvme_add_child_request(ns, qpair, payload, payload_offset, md_offset,
+ child_lba, child_lba_count,
+ cb_fn, cb_arg, opc, io_flags,
+ apptag_mask, apptag, req, false);
+ if (child == NULL) {
+ return NULL;
+ }
+ payload_offset += child_length;
+ md_offset += child_lba_count * ns->md_size;
+ child_lba += child_lba_count;
+ child_length = 0;
+ num_sges = 0;
+ }
+ }
+
+ if (child_length == req->payload_size) {
+ /* No splitting was required, so setup the whole payload as one request. */
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ }
+
+ return req;
+}
+
+static struct nvme_request *
+_nvme_ns_cmd_rw(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ const struct nvme_payload *payload, uint32_t payload_offset, uint32_t md_offset,
+ uint64_t lba, uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t opc,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag, bool check_sgl)
+{
+ struct nvme_request *req;
+ uint32_t sector_size;
+ uint32_t sectors_per_max_io;
+ uint32_t sectors_per_stripe;
+
+ if (io_flags & 0xFFFF) {
+ /* The bottom 16 bits must be clear */
+ SPDK_ERRLOG("io_flags 0x%x bottom 16 bits are not clear\n",
+ io_flags);
+ return NULL;
+ }
+
+ sector_size = ns->extended_lba_size;
+ sectors_per_max_io = ns->sectors_per_max_io;
+ sectors_per_stripe = ns->sectors_per_stripe;
+
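+ /*
+ * When PRACT is set and the namespace uses 8-byte protection information
+ * in an extended-LBA format, the controller inserts/strips the PI itself,
+ * so the host buffer does not include those 8 bytes per block.
+ */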
+ if ((io_flags & SPDK_NVME_IO_FLAGS_PRACT) &&
+ (ns->flags & SPDK_NVME_NS_EXTENDED_LBA_SUPPORTED) &&
+ (ns->flags & SPDK_NVME_NS_DPS_PI_SUPPORTED) &&
+ (ns->md_size == 8)) {
+ sector_size -= 8;
+ }
+
+ req = nvme_allocate_request(qpair, payload, lba_count * sector_size, cb_fn, cb_arg);
+ if (req == NULL) {
+ return NULL;
+ }
+
+ req->payload_offset = payload_offset;
+ req->md_offset = md_offset;
+
+ /*
+ * Intel DC P3*00 NVMe controllers benefit from driver-assisted striping.
+ * If this controller defines a stripe boundary and this I/O spans a stripe
+ * boundary, split the request into multiple requests and submit each
+ * separately to hardware.
+ */
+ if (sectors_per_stripe > 0 &&
+ (((lba & (sectors_per_stripe - 1)) + lba_count) > sectors_per_stripe)) {
+
+ return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
+ cb_fn,
+ cb_arg, opc,
+ io_flags, req, sectors_per_stripe, sectors_per_stripe - 1, apptag_mask, apptag);
+ } else if (lba_count > sectors_per_max_io) {
+ return _nvme_ns_cmd_split_request(ns, qpair, payload, payload_offset, md_offset, lba, lba_count,
+ cb_fn,
+ cb_arg, opc,
+ io_flags, req, sectors_per_max_io, 0, apptag_mask, apptag);
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL && check_sgl) {
+ if (ns->ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
+ return _nvme_ns_cmd_split_request_sgl(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc, io_flags,
+ req, apptag_mask, apptag);
+ } else {
+ return _nvme_ns_cmd_split_request_prp(ns, qpair, payload, payload_offset, md_offset,
+ lba, lba_count, cb_fn, cb_arg, opc, io_flags,
+ req, apptag_mask, apptag);
+ }
+ }
+
+ _nvme_ns_cmd_setup_request(ns, req, opc, lba, lba_count, io_flags, apptag_mask, apptag);
+ return req;
+}
+
+int
+spdk_nvme_ns_cmd_compare(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags, 0,
+ 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_compare_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ void *metadata,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags,
+ apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_comparev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg,
+ SPDK_NVME_OPC_COMPARE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_read(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, 0,
+ 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
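+
+/*
+ * Illustrative usage sketch (not part of the original file); "read_done"
+ * is a hypothetical spdk_nvme_cmd_cb callback and the qpair is assumed to
+ * have been allocated already. Assuming 512-byte sectors, this reads 8
+ * blocks from LBA 0 and then polls for the completion:
+ *
+ *   void *buf = spdk_dma_zmalloc(4096, 4096, NULL);
+ *
+ *   if (buf != NULL &&
+ *       spdk_nvme_ns_cmd_read(ns, qpair, buf, 0, 8, read_done, buf, 0) == 0) {
+ *       spdk_nvme_qpair_process_completions(qpair, 0);
+ *   }
+ */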
+
+int
+spdk_nvme_ns_cmd_read_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair, void *buffer,
+ void *metadata,
+ uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags,
+ apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_readv(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_readv_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_READ,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer, uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata, uint64_t lba,
+ uint32_t lba_count, spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags, uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_writev(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, NULL);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, 0, 0, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_writev_with_md(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg, uint32_t io_flags,
+ spdk_nvme_req_reset_sgl_cb reset_sgl_fn,
+ spdk_nvme_req_next_sge_cb next_sge_fn, void *metadata,
+ uint16_t apptag_mask, uint16_t apptag)
+{
+ struct nvme_request *req;
+ struct nvme_payload payload;
+
+ if (reset_sgl_fn == NULL || next_sge_fn == NULL) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_SGL(reset_sgl_fn, next_sge_fn, cb_arg, metadata);
+
+ req = _nvme_ns_cmd_rw(ns, qpair, &payload, 0, 0, lba, lba_count, cb_fn, cb_arg, SPDK_NVME_OPC_WRITE,
+ io_flags, apptag_mask, apptag, true);
+ if (req != NULL) {
+ return nvme_qpair_submit_request(qpair, req);
+ } else if (spdk_nvme_ns_check_request_length(lba_count,
+ ns->sectors_per_max_io,
+ ns->sectors_per_stripe,
+ qpair->ctrlr->opts.io_queue_requests)) {
+ return -EINVAL;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+int
+spdk_nvme_ns_cmd_write_zeroes(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint64_t lba, uint32_t lba_count,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ uint64_t *tmp_lba;
+
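+ /*
+ * NLB in CDW12 is a 0-based 16-bit field, so a single Write Zeroes
+ * command can cover at most 65536 blocks.
+ */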
+ if (lba_count == 0 || lba_count > UINT16_MAX + 1) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_WRITE_ZEROES;
+ cmd->nsid = ns->id;
+
+ tmp_lba = (uint64_t *)&cmd->cdw10;
+ *tmp_lba = lba;
+ cmd->cdw12 = lba_count - 1;
+ cmd->cdw12 |= io_flags;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_dataset_management(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ uint32_t type,
+ const struct spdk_nvme_dsm_range *ranges, uint16_t num_ranges,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (num_ranges == 0 || num_ranges > SPDK_NVME_DATASET_MANAGEMENT_MAX_RANGES) {
+ return -EINVAL;
+ }
+
+ if (ranges == NULL) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_user_copy(qpair, (void *)ranges,
+ num_ranges * sizeof(struct spdk_nvme_dsm_range),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10 = num_ranges - 1;
+ cmd->cdw11 = type;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
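+
+/*
+ * Illustrative usage sketch (not part of the original file), deallocating
+ * (trimming) 1024 blocks starting at LBA 0; "dsm_done" is a hypothetical
+ * completion callback:
+ *
+ *   struct spdk_nvme_dsm_range range = {
+ *       .starting_lba = 0,
+ *       .length = 1024,
+ *   };
+ *
+ *   spdk_nvme_ns_cmd_dataset_management(ns, qpair,
+ *                                       SPDK_NVME_DSM_ATTR_DEALLOCATE,
+ *                                       &range, 1, dsm_done, NULL);
+ */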
+
+int
+spdk_nvme_ns_cmd_flush(struct spdk_nvme_ns *ns, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_FLUSH;
+ cmd->nsid = ns->id;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_register(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_register_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_register_action action,
+ enum spdk_nvme_reservation_register_cptpl cptpl,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_register_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_REGISTER;
+ cmd->nsid = ns->id;
+
+ /* Bits 0-2 */
+ cmd->cdw10 = action;
+ /* Bit 3 */
+ cmd->cdw10 |= ignore_key ? 1 << 3 : 0;
+ /* Bits 30-31 */
+ cmd->cdw10 |= (uint32_t)cptpl << 30;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_release(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_key_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_release_action action,
+ enum spdk_nvme_reservation_type type,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_key_data), cb_fn,
+ cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_RELEASE;
+ cmd->nsid = ns->id;
+
+ /* Bits 0-2 */
+ cmd->cdw10 = action;
+ /* Bit 3 */
+ cmd->cdw10 |= ignore_key ? 1 << 3 : 0;
+ /* Bits 8-15 */
+ cmd->cdw10 |= (uint32_t)type << 8;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_acquire(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_reservation_acquire_data *payload,
+ bool ignore_key,
+ enum spdk_nvme_reservation_acquire_action action,
+ enum spdk_nvme_reservation_type type,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_user_copy(qpair,
+ payload, sizeof(struct spdk_nvme_reservation_acquire_data),
+ cb_fn, cb_arg, true);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_ACQUIRE;
+ cmd->nsid = ns->id;
+
+ /* Bits 0-2 */
+ cmd->cdw10 = action;
+ /* Bit 3 */
+ cmd->cdw10 |= ignore_key ? 1 << 3 : 0;
+ /* Bits 8-15 */
+ cmd->cdw10 |= (uint32_t)type << 8;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ns_cmd_reservation_report(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *payload, uint32_t len,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ uint32_t num_dwords;
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (len % 4) {
+ return -EINVAL;
+ }
+ num_dwords = len / 4;
+
+ req = nvme_allocate_request_user_copy(qpair, payload, len, cb_fn, cb_arg, false);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_RESERVATION_REPORT;
+ cmd->nsid = ns->id;
+
+ cmd->cdw10 = num_dwords;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
diff --git a/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c
new file mode 100644
index 00000000..2a574992
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_ns_ocssd_cmd.c
@@ -0,0 +1,232 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/nvme_ocssd.h"
+#include "nvme_internal.h"
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_reset(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ uint64_t *lba_list, uint32_t num_lbas,
+ struct spdk_ocssd_chunk_information_entry *chunk_info,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ if (!lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_VECTOR_RESET;
+ cmd->nsid = ns->id;
+
+ if (chunk_info != NULL) {
+ cmd->mptr = spdk_vtophys(chunk_info);
+ }
+
+ /*
+ * Dwords 10 and 11 store a pointer to the list of logical block addresses.
+ * If the list contains a single entry, the logical block address itself
+ * is stored there instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+static int
+_nvme_ocssd_ns_cmd_vector_rw_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ enum spdk_ocssd_io_opcode opc,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+ struct nvme_payload payload;
+ uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY;
+
+ if (io_flags & ~valid_flags) {
+ return -EINVAL;
+ }
+
+ if (!buffer || !lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ payload = NVME_PAYLOAD_CONTIG(buffer, metadata);
+
+ req = nvme_allocate_request(qpair, &payload, num_lbas * ns->sector_size, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = opc;
+ cmd->nsid = ns->id;
+
+ /*
+ * Dwords 10 and 11 store a pointer to the list of logical block addresses.
+ * If the list contains a single entry, the logical block address itself
+ * is stored there instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(lba_list);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+ cmd->cdw12 |= io_flags;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_write_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_write(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_WRITE, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_read_with_md(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer, void *metadata,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, metadata, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_read(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ void *buffer,
+ uint64_t *lba_list, uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ return _nvme_ocssd_ns_cmd_vector_rw_with_md(ns, qpair, buffer, NULL, lba_list,
+ num_lbas, cb_fn, cb_arg, SPDK_OCSSD_OPC_VECTOR_READ, io_flags);
+}
+
+int
+spdk_nvme_ocssd_ns_cmd_vector_copy(struct spdk_nvme_ns *ns,
+ struct spdk_nvme_qpair *qpair,
+ uint64_t *dst_lba_list,
+ uint64_t *src_lba_list,
+ uint32_t num_lbas,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg,
+ uint32_t io_flags)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ uint32_t valid_flags = SPDK_OCSSD_IO_FLAGS_LIMITED_RETRY;
+
+ if (io_flags & ~valid_flags) {
+ return -EINVAL;
+ }
+
+ if (!dst_lba_list || !src_lba_list || (num_lbas == 0) ||
+ (num_lbas > SPDK_NVME_OCSSD_MAX_LBAL_ENTRIES)) {
+ return -EINVAL;
+ }
+
+ req = nvme_allocate_request_null(qpair, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_OCSSD_OPC_VECTOR_COPY;
+ cmd->nsid = ns->id;
+
+ /*
+ * Dwords 10 and 11 store a pointer to the list of source logical
+ * block addresses.
+ * Dwords 14 and 15 store a pointer to the list of destination logical
+ * block addresses.
+ * If each list contains a single entry, the logical block address itself
+ * is stored there instead.
+ */
+ if (num_lbas == 1) {
+ *(uint64_t *)&cmd->cdw10 = *src_lba_list;
+ *(uint64_t *)&cmd->cdw14 = *dst_lba_list;
+ } else {
+ *(uint64_t *)&cmd->cdw10 = spdk_vtophys(src_lba_list);
+ *(uint64_t *)&cmd->cdw14 = spdk_vtophys(dst_lba_list);
+ }
+
+ cmd->cdw12 = num_lbas - 1;
+ cmd->cdw12 |= io_flags;
+
+ return nvme_qpair_submit_request(qpair, req);
+}
diff --git a/src/spdk/lib/nvme/nvme_pcie.c b/src/spdk/lib/nvme/nvme_pcie.c
new file mode 100644
index 00000000..8042380c
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_pcie.c
@@ -0,0 +1,2142 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * Copyright (c) 2017, IBM Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over PCIe transport
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/env.h"
+#include "spdk/likely.h"
+#include "nvme_internal.h"
+#include "nvme_uevent.h"
+
+/*
+ * Number of completion queue entries to process before ringing the
+ * completion queue doorbell.
+ */
+#define NVME_MIN_COMPLETIONS (1)
+#define NVME_MAX_COMPLETIONS (128)
+
+#define NVME_ADMIN_ENTRIES (128)
+
+/*
+ * NVME_MAX_SGL_DESCRIPTORS defines the maximum number of descriptors in one SGL
+ * segment.
+ */
+#define NVME_MAX_SGL_DESCRIPTORS (253)
+
+#define NVME_MAX_PRP_LIST_ENTRIES (506)
+
+struct nvme_pcie_enum_ctx {
+ spdk_nvme_probe_cb probe_cb;
+ void *cb_ctx;
+ struct spdk_pci_addr pci_addr;
+ bool has_pci_addr;
+};
+
+/* PCIe transport extensions for spdk_nvme_ctrlr */
+struct nvme_pcie_ctrlr {
+ struct spdk_nvme_ctrlr ctrlr;
+
+ /** NVMe MMIO register space */
+ volatile struct spdk_nvme_registers *regs;
+
+ /** NVMe MMIO register size */
+ uint64_t regs_size;
+
+ /* BAR mapping address which contains controller memory buffer */
+ void *cmb_bar_virt_addr;
+
+ /* BAR physical address which contains controller memory buffer */
+ uint64_t cmb_bar_phys_addr;
+
+ /* Controller memory buffer size in Bytes */
+ uint64_t cmb_size;
+
+ /* Current offset of controller memory buffer, relative to start of BAR virt addr */
+ uint64_t cmb_current_offset;
+
+ /* Last valid offset into the CMB; this differs depending on whether CMB memory registration occurred */
+ uint64_t cmb_max_offset;
+
+ void *cmb_mem_register_addr;
+ size_t cmb_mem_register_size;
+
+ bool cmb_io_data_supported;
+
+ /** stride in uint32_t units between doorbell registers (1 = 4 bytes, 2 = 8 bytes, ...) */
+ uint32_t doorbell_stride_u32;
+
+ /* Opaque handle to associated PCI device. */
+ struct spdk_pci_device *devhandle;
+
+ /* File descriptor returned from spdk_pci_device_claim(). Closed when ctrlr is detached. */
+ int claim_fd;
+
+ /* Flag to indicate that the MMIO registers have been remapped */
+ bool is_remapped;
+};
+
+struct nvme_tracker {
+ TAILQ_ENTRY(nvme_tracker) tq_list;
+
+ struct nvme_request *req;
+ uint16_t cid;
+
+ uint16_t rsvd1: 15;
+ uint16_t active: 1;
+
+ uint32_t rsvd2;
+
+ uint64_t rsvd3;
+
+ uint64_t prp_sgl_bus_addr;
+
+ union {
+ uint64_t prp[NVME_MAX_PRP_LIST_ENTRIES];
+ struct spdk_nvme_sgl_descriptor sgl[NVME_MAX_SGL_DESCRIPTORS];
+ } u;
+};
+/*
+ * struct nvme_tracker must be exactly 4K so that the prp[] array does not cross a page boundary
+ * and so that there is no padding required to meet alignment requirements.
+ */
+SPDK_STATIC_ASSERT(sizeof(struct nvme_tracker) == 4096, "nvme_tracker is not 4K");
+SPDK_STATIC_ASSERT((offsetof(struct nvme_tracker, u.sgl) & 7) == 0, "SGL must be Qword aligned");
+
+/* PCIe transport extensions for spdk_nvme_qpair */
+struct nvme_pcie_qpair {
+ /* Submission queue tail doorbell */
+ volatile uint32_t *sq_tdbl;
+
+ /* Completion queue head doorbell */
+ volatile uint32_t *cq_hdbl;
+
+ /* Submission queue shadow tail doorbell */
+ volatile uint32_t *sq_shadow_tdbl;
+
+ /* Completion queue shadow head doorbell */
+ volatile uint32_t *cq_shadow_hdbl;
+
+ /* Submission queue event index */
+ volatile uint32_t *sq_eventidx;
+
+ /* Completion queue event index */
+ volatile uint32_t *cq_eventidx;
+
+ /* Submission queue */
+ struct spdk_nvme_cmd *cmd;
+
+ /* Completion queue */
+ struct spdk_nvme_cpl *cpl;
+
+ TAILQ_HEAD(, nvme_tracker) free_tr;
+ TAILQ_HEAD(nvme_outstanding_tr_head, nvme_tracker) outstanding_tr;
+
+ /* Array of trackers indexed by command ID. */
+ struct nvme_tracker *tr;
+
+ uint16_t num_entries;
+
+ uint16_t max_completions_cap;
+
+ uint16_t sq_tail;
+ uint16_t cq_head;
+ uint16_t sq_head;
+
+ uint8_t phase;
+
+ bool is_enabled;
+
+ /*
+ * Base qpair structure.
+ * This is located after the hot data in this structure so that the important parts of
+ * nvme_pcie_qpair are in the same cache line.
+ */
+ struct spdk_nvme_qpair qpair;
+
+ /*
+ * Fields below this point should not be touched on the normal I/O path.
+ */
+
+ bool sq_in_cmb;
+
+ uint64_t cmd_bus_addr;
+ uint64_t cpl_bus_addr;
+};
+
+static int nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx,
+ struct spdk_pci_addr *pci_addr);
+static int nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair);
+static int nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair);
+
+__thread struct nvme_pcie_ctrlr *g_thread_mmio_ctrlr = NULL;
+static volatile uint16_t g_signal_lock;
+static bool g_sigset = false;
+static int hotplug_fd = -1;
+
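+/*
+ * SIGBUS handler for surprise hot remove: when the PCI BAR disappears, an
+ * in-flight MMIO access faults here. The handler remaps the register
+ * window to anonymous memory filled with 0xFF so subsequent register
+ * reads return all-ones, which the hotplug monitor detects via CSTS.
+ */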
+static void
+nvme_sigbus_fault_sighandler(int signum, siginfo_t *info, void *ctx)
+{
+ void *map_address;
+
+ if (!__sync_bool_compare_and_swap(&g_signal_lock, 0, 1)) {
+ return;
+ }
+
+ assert(g_thread_mmio_ctrlr != NULL);
+
+ if (!g_thread_mmio_ctrlr->is_remapped) {
+ map_address = mmap((void *)g_thread_mmio_ctrlr->regs, g_thread_mmio_ctrlr->regs_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ if (map_address == MAP_FAILED) {
+ SPDK_ERRLOG("mmap failed\n");
+ g_signal_lock = 0;
+ return;
+ }
+ memset(map_address, 0xFF, sizeof(struct spdk_nvme_registers));
+ g_thread_mmio_ctrlr->regs = (volatile struct spdk_nvme_registers *)map_address;
+ g_thread_mmio_ctrlr->is_remapped = true;
+ }
+ g_signal_lock = 0;
+ return;
+}
+
+static void
+nvme_pcie_ctrlr_setup_signal(void)
+{
+ struct sigaction sa;
+
+ sa.sa_sigaction = nvme_sigbus_fault_sighandler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO;
+ sigaction(SIGBUS, &sa, NULL);
+}
+
+static int
+_nvme_pcie_hotplug_monitor(void *cb_ctx, spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_remove_cb remove_cb)
+{
+ struct spdk_nvme_ctrlr *ctrlr, *tmp;
+ struct spdk_uevent event;
+ struct spdk_pci_addr pci_addr;
+ union spdk_nvme_csts_register csts;
+ struct spdk_nvme_ctrlr_process *proc;
+
+ while (spdk_get_uevent(hotplug_fd, &event) > 0) {
+ if (event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_UIO ||
+ event.subsystem == SPDK_NVME_UEVENT_SUBSYSTEM_VFIO) {
+ if (event.action == SPDK_NVME_UEVENT_ADD) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "add nvme address: %s\n",
+ event.traddr);
+ if (spdk_process_is_primary()) {
+ if (!spdk_pci_addr_parse(&pci_addr, event.traddr)) {
+ nvme_pcie_ctrlr_attach(probe_cb, cb_ctx, &pci_addr);
+ }
+ }
+ } else if (event.action == SPDK_NVME_UEVENT_REMOVE) {
+ struct spdk_nvme_transport_id trid;
+
+ memset(&trid, 0, sizeof(trid));
+ trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
+ snprintf(trid.traddr, sizeof(trid.traddr), "%s", event.traddr);
+
+ ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
+ if (ctrlr == NULL) {
+ return 0;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "remove nvme address: %s\n",
+ event.traddr);
+
+ nvme_ctrlr_fail(ctrlr, true);
+
+ /* get the user app to clean up and stop I/O */
+ if (remove_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ remove_cb(cb_ctx, ctrlr);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+ }
+ }
+
+ /* This is a workaround to detect hot removal of vfio-attached devices. */
+ TAILQ_FOREACH_SAFE(ctrlr, &g_spdk_nvme_driver->shared_attached_ctrlrs, tailq, tmp) {
+ /* NVMe controller BAR must be mapped to secondary process space before any access. */
+ proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (proc) {
+ csts = spdk_nvme_ctrlr_get_regs_csts(ctrlr);
+ if (csts.raw == 0xffffffffU) {
+ nvme_ctrlr_fail(ctrlr, true);
+ if (remove_cb) {
+ nvme_robust_mutex_unlock(&g_spdk_nvme_driver->lock);
+ remove_cb(cb_ctx, ctrlr);
+ nvme_robust_mutex_lock(&g_spdk_nvme_driver->lock);
+ }
+ }
+ }
+ }
+ return 0;
+}
+
+static inline struct nvme_pcie_ctrlr *
+nvme_pcie_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_PCIE);
+ return SPDK_CONTAINEROF(ctrlr, struct nvme_pcie_ctrlr, ctrlr);
+}
+
+static inline struct nvme_pcie_qpair *
+nvme_pcie_qpair(struct spdk_nvme_qpair *qpair)
+{
+ assert(qpair->trtype == SPDK_NVME_TRANSPORT_PCIE);
+ return SPDK_CONTAINEROF(qpair, struct nvme_pcie_qpair, qpair);
+}
+
+static volatile void *
+nvme_pcie_reg_addr(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ return (volatile void *)((uintptr_t)pctrlr->regs + offset);
+}
+
+int
+nvme_pcie_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(nvme_pcie_reg_addr(ctrlr, offset), value);
+ g_thread_mmio_ctrlr = NULL;
+ return 0;
+}
+
+int
+nvme_pcie_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_8(nvme_pcie_reg_addr(ctrlr, offset), value);
+ g_thread_mmio_ctrlr = NULL;
+ return 0;
+}
+
+int
+nvme_pcie_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 4);
+ assert(value != NULL);
+ g_thread_mmio_ctrlr = pctrlr;
+ *value = spdk_mmio_read_4(nvme_pcie_reg_addr(ctrlr, offset));
+ g_thread_mmio_ctrlr = NULL;
+ if (~(*value) == 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+nvme_pcie_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+
+ assert(offset <= sizeof(struct spdk_nvme_registers) - 8);
+ assert(value != NULL);
+ g_thread_mmio_ctrlr = pctrlr;
+ *value = spdk_mmio_read_8(nvme_pcie_reg_addr(ctrlr, offset));
+ g_thread_mmio_ctrlr = NULL;
+ if (~(*value) == 0) {
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_set_asq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
+{
+ return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, asq),
+ value);
+}
+
+static int
+nvme_pcie_ctrlr_set_acq(struct nvme_pcie_ctrlr *pctrlr, uint64_t value)
+{
+ return nvme_pcie_ctrlr_set_reg_8(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, acq),
+ value);
+}
+
+static int
+nvme_pcie_ctrlr_set_aqa(struct nvme_pcie_ctrlr *pctrlr, const union spdk_nvme_aqa_register *aqa)
+{
+ return nvme_pcie_ctrlr_set_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, aqa.raw),
+ aqa->raw);
+}
+
+static int
+nvme_pcie_ctrlr_get_cmbloc(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbloc_register *cmbloc)
+{
+ return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbloc.raw),
+ &cmbloc->raw);
+}
+
+static int
+nvme_pcie_ctrlr_get_cmbsz(struct nvme_pcie_ctrlr *pctrlr, union spdk_nvme_cmbsz_register *cmbsz)
+{
+ return nvme_pcie_ctrlr_get_reg_4(&pctrlr->ctrlr, offsetof(struct spdk_nvme_registers, cmbsz.raw),
+ &cmbsz->raw);
+}
+
+uint32_t
+nvme_pcie_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /*
+ * For commands requiring more than 2 PRP entries, one PRP will be
+ * embedded in the command (prp1), and the rest of the PRP entries
+ * will be in a list pointed to by the command (prp2). This means
+ * that the real maximum number of PRP entries we support is 506+1, which
+ * results in a max xfer size of 506*ctrlr->page_size.
+ */
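+ /*
+ * For example, assuming a 4 KiB controller page size, this works out to
+ * 506 * 4096 = 2,072,576 bytes (just under 2 MiB) per command.
+ */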
+ return NVME_MAX_PRP_LIST_ENTRIES * ctrlr->page_size;
+}
+
+uint16_t
+nvme_pcie_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return NVME_MAX_SGL_DESCRIPTORS;
+}
+
+static void
+nvme_pcie_ctrlr_map_cmb(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc;
+ void *addr;
+ uint32_t bir;
+ union spdk_nvme_cmbsz_register cmbsz;
+ union spdk_nvme_cmbloc_register cmbloc;
+ uint64_t size, unit_size, offset, bar_size, bar_phys_addr;
+ uint64_t mem_register_start, mem_register_end;
+
+ if (nvme_pcie_ctrlr_get_cmbsz(pctrlr, &cmbsz) ||
+ nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
+ SPDK_ERRLOG("get registers failed\n");
+ goto exit;
+ }
+
+ if (!cmbsz.bits.sz) {
+ goto exit;
+ }
+
+ bir = cmbloc.bits.bir;
+ /* Values 0, 2, 3, 4 and 5 are valid for the BAR */
+ if (bir > 5 || bir == 1) {
+ goto exit;
+ }
+
+ /* unit size for 4KB/64KB/1MB/16MB/256MB/4GB/64GB */
+ unit_size = (uint64_t)1 << (12 + 4 * cmbsz.bits.szu);
+ /* controller memory buffer size in Bytes */
+ size = unit_size * cmbsz.bits.sz;
+ /* controller memory buffer offset from BAR in Bytes */
+ offset = unit_size * cmbloc.bits.ofst;
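+ /* Example: szu = 2 selects 1 MiB units, so sz = 16 describes a 16 MiB CMB
+ * and ofst is likewise expressed in 1 MiB units. */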
+
+ rc = spdk_pci_device_map_bar(pctrlr->devhandle, bir, &addr,
+ &bar_phys_addr, &bar_size);
+ if ((rc != 0) || addr == NULL) {
+ goto exit;
+ }
+
+ if (offset > bar_size) {
+ goto exit;
+ }
+
+ if (size > bar_size - offset) {
+ goto exit;
+ }
+
+ pctrlr->cmb_bar_virt_addr = addr;
+ pctrlr->cmb_bar_phys_addr = bar_phys_addr;
+ pctrlr->cmb_size = size;
+ pctrlr->cmb_current_offset = offset;
+ pctrlr->cmb_max_offset = offset + size;
+
+ if (!cmbsz.bits.sqs) {
+ pctrlr->ctrlr.opts.use_cmb_sqs = false;
+ }
+
+ /* If only SQS is supported, use the legacy mapping. */
+ if (cmbsz.bits.sqs && !(cmbsz.bits.wds || cmbsz.bits.rds)) {
+ return;
+ }
+
+ /* If the CMB is smaller than 4 MiB, abort the CMB mapping. */
+ if (pctrlr->cmb_size < (1ULL << 22)) {
+ goto exit;
+ }
+
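+ /* Round the registered window inward to 2 MiB boundaries, since
+ * spdk_mem_register() operates on hugepage-sized (2 MiB) regions. */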
+ mem_register_start = (((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + 0x1fffff) & ~(0x200000 - 1));
+ mem_register_end = ((uintptr_t)pctrlr->cmb_bar_virt_addr + offset + pctrlr->cmb_size);
+ mem_register_end &= ~(uint64_t)(0x200000 - 1);
+ pctrlr->cmb_mem_register_addr = (void *)mem_register_start;
+ pctrlr->cmb_mem_register_size = mem_register_end - mem_register_start;
+
+ rc = spdk_mem_register(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
+ if (rc) {
+ SPDK_ERRLOG("spdk_mem_register() failed\n");
+ goto exit;
+ }
+ pctrlr->cmb_current_offset = mem_register_start - ((uint64_t)pctrlr->cmb_bar_virt_addr);
+ pctrlr->cmb_max_offset = mem_register_end - ((uint64_t)pctrlr->cmb_bar_virt_addr);
+ pctrlr->cmb_io_data_supported = true;
+
+ return;
+exit:
+ pctrlr->cmb_bar_virt_addr = NULL;
+ pctrlr->ctrlr.opts.use_cmb_sqs = false;
+ return;
+}
+
+static int
+nvme_pcie_ctrlr_unmap_cmb(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc = 0;
+ union spdk_nvme_cmbloc_register cmbloc;
+ void *addr = pctrlr->cmb_bar_virt_addr;
+
+ if (addr) {
+ if (pctrlr->cmb_mem_register_addr) {
+ spdk_mem_unregister(pctrlr->cmb_mem_register_addr, pctrlr->cmb_mem_register_size);
+ }
+
+ if (nvme_pcie_ctrlr_get_cmbloc(pctrlr, &cmbloc)) {
+ SPDK_ERRLOG("get_cmbloc() failed\n");
+ return -EIO;
+ }
+ rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, cmbloc.bits.bir, addr);
+ }
+ return rc;
+}
+
+static int
+nvme_pcie_ctrlr_alloc_cmb(struct spdk_nvme_ctrlr *ctrlr, uint64_t length, uint64_t aligned,
+ uint64_t *offset)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ uint64_t round_offset;
+
+ round_offset = pctrlr->cmb_current_offset;
+ round_offset = (round_offset + (aligned - 1)) & ~(aligned - 1);
+
+ /* The CMB may consume only part of the BAR; calculate accordingly. */
+ if (round_offset + length > pctrlr->cmb_max_offset) {
+ SPDK_ERRLOG("Tried to allocate past valid CMB range!\n");
+ return -1;
+ }
+
+ *offset = round_offset;
+ pctrlr->cmb_current_offset = round_offset + length;
+
+ return 0;
+}
+
+void *
+nvme_pcie_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ uint64_t offset;
+
+ if (pctrlr->cmb_bar_virt_addr == NULL) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB not available\n");
+ return NULL;
+ }
+
+ if (!pctrlr->cmb_io_data_supported) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CMB doesn't support I/O data\n");
+ return NULL;
+ }
+
+ if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, size, 4, &offset) != 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "%zu-byte CMB allocation failed\n", size);
+ return NULL;
+ }
+
+ return pctrlr->cmb_bar_virt_addr + offset;
+}
+
+int
+nvme_pcie_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
+{
+ /*
+ * Do nothing for now.
+ * TODO: Track free space so buffers may be reused.
+ */
+ SPDK_ERRLOG("%s: no deallocation for CMB buffers yet!\n",
+ __func__);
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_allocate_bars(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc;
+ void *addr;
+ uint64_t phys_addr, size;
+
+ rc = spdk_pci_device_map_bar(pctrlr->devhandle, 0, &addr,
+ &phys_addr, &size);
+ pctrlr->regs = (volatile struct spdk_nvme_registers *)addr;
+ if ((pctrlr->regs == NULL) || (rc != 0)) {
+ SPDK_ERRLOG("nvme_pcicfg_map_bar failed with rc %d or bar %p\n",
+ rc, pctrlr->regs);
+ return -1;
+ }
+
+ pctrlr->regs_size = size;
+ nvme_pcie_ctrlr_map_cmb(pctrlr);
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_free_bars(struct nvme_pcie_ctrlr *pctrlr)
+{
+ int rc = 0;
+ void *addr = (void *)pctrlr->regs;
+
+ if (pctrlr->ctrlr.is_removed) {
+ return rc;
+ }
+
+ rc = nvme_pcie_ctrlr_unmap_cmb(pctrlr);
+ if (rc != 0) {
+ SPDK_ERRLOG("nvme_ctrlr_unmap_cmb failed with error code %d\n", rc);
+ return -1;
+ }
+
+ if (addr) {
+ /* NOTE: addr may have been remapped here. We're relying on DPDK to call
+ * munmap internally.
+ */
+ rc = spdk_pci_device_unmap_bar(pctrlr->devhandle, 0, addr);
+ }
+ return rc;
+}
+
+static int
+nvme_pcie_ctrlr_construct_admin_qpair(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_qpair *pqpair;
+ int rc;
+
+ pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL, SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair == NULL) {
+ return -ENOMEM;
+ }
+
+ pqpair->num_entries = NVME_ADMIN_ENTRIES;
+
+ ctrlr->adminq = &pqpair->qpair;
+
+ rc = nvme_qpair_init(ctrlr->adminq,
+ 0, /* qpair ID */
+ ctrlr,
+ SPDK_NVME_QPRIO_URGENT,
+ NVME_ADMIN_ENTRIES);
+ if (rc != 0) {
+ return rc;
+ }
+
+ return nvme_pcie_qpair_construct(ctrlr->adminq);
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+static int
+pcie_nvme_enum_cb(void *ctx, struct spdk_pci_device *pci_dev)
+{
+ struct spdk_nvme_transport_id trid = {};
+ struct nvme_pcie_enum_ctx *enum_ctx = ctx;
+ struct spdk_nvme_ctrlr *ctrlr;
+ struct spdk_pci_addr pci_addr;
+
+ pci_addr = spdk_pci_device_get_addr(pci_dev);
+
+ trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
+ spdk_pci_addr_fmt(trid.traddr, sizeof(trid.traddr), &pci_addr);
+
+ /* Verify that this controller is not already attached */
+ ctrlr = spdk_nvme_get_ctrlr_by_trid_unsafe(&trid);
+ if (ctrlr) {
+ if (spdk_process_is_primary()) {
+ /* Already attached */
+ return 0;
+ } else {
+ return nvme_ctrlr_add_process(ctrlr, pci_dev);
+ }
+ }
+
+ /* check whether user passes the pci_addr */
+ if (enum_ctx->has_pci_addr &&
+ (spdk_pci_addr_compare(&pci_addr, &enum_ctx->pci_addr) != 0)) {
+ return 1;
+ }
+
+ return nvme_ctrlr_probe(&trid, pci_dev,
+ enum_ctx->probe_cb, enum_ctx->cb_ctx);
+}
+
+int
+nvme_pcie_ctrlr_scan(const struct spdk_nvme_transport_id *trid,
+ void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_remove_cb remove_cb,
+ bool direct_connect)
+{
+ struct nvme_pcie_enum_ctx enum_ctx = {};
+
+ enum_ctx.probe_cb = probe_cb;
+ enum_ctx.cb_ctx = cb_ctx;
+
+ if (strlen(trid->traddr) != 0) {
+ if (spdk_pci_addr_parse(&enum_ctx.pci_addr, trid->traddr)) {
+ return -1;
+ }
+ enum_ctx.has_pci_addr = true;
+ }
+
+ if (hotplug_fd < 0) {
+ hotplug_fd = spdk_uevent_connect();
+ if (hotplug_fd < 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Failed to open uevent netlink socket\n");
+ }
+ } else {
+ _nvme_pcie_hotplug_monitor(cb_ctx, probe_cb, remove_cb);
+ }
+
+ if (enum_ctx.has_pci_addr == false) {
+ return spdk_pci_nvme_enumerate(pcie_nvme_enum_cb, &enum_ctx);
+ } else {
+ return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, &enum_ctx.pci_addr);
+ }
+}
+
+static int
+nvme_pcie_ctrlr_attach(spdk_nvme_probe_cb probe_cb, void *cb_ctx, struct spdk_pci_addr *pci_addr)
+{
+ struct nvme_pcie_enum_ctx enum_ctx;
+
+ enum_ctx.probe_cb = probe_cb;
+ enum_ctx.cb_ctx = cb_ctx;
+
+ return spdk_pci_nvme_device_attach(pcie_nvme_enum_cb, &enum_ctx, pci_addr);
+}
+
+struct spdk_nvme_ctrlr *nvme_pcie_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ struct spdk_pci_device *pci_dev = devhandle;
+ struct nvme_pcie_ctrlr *pctrlr;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+ uint32_t cmd_reg;
+ int rc, claim_fd;
+ struct spdk_pci_id pci_id;
+ struct spdk_pci_addr pci_addr;
+
+ if (spdk_pci_addr_parse(&pci_addr, trid->traddr)) {
+ SPDK_ERRLOG("could not parse pci address\n");
+ return NULL;
+ }
+
+ claim_fd = spdk_pci_device_claim(&pci_addr);
+ if (claim_fd < 0) {
+ SPDK_ERRLOG("could not claim device %s\n", trid->traddr);
+ return NULL;
+ }
+
+ pctrlr = spdk_zmalloc(sizeof(struct nvme_pcie_ctrlr), 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pctrlr == NULL) {
+ close(claim_fd);
+ SPDK_ERRLOG("could not allocate ctrlr\n");
+ return NULL;
+ }
+
+ pctrlr->is_remapped = false;
+ pctrlr->ctrlr.is_removed = false;
+ pctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_PCIE;
+ pctrlr->devhandle = devhandle;
+ pctrlr->ctrlr.opts = *opts;
+ pctrlr->claim_fd = claim_fd;
+ memcpy(&pctrlr->ctrlr.trid, trid, sizeof(pctrlr->ctrlr.trid));
+
+ rc = nvme_pcie_ctrlr_allocate_bars(pctrlr);
+ if (rc != 0) {
+ close(claim_fd);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ /* Enable PCI busmaster and disable INTx */
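+ /* (In the PCI command register, bit 2 is Bus Master Enable and bit 10 is
+ * Interrupt Disable, hence the 0x404 mask below.) */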
+ spdk_pci_device_cfg_read32(pci_dev, &cmd_reg, 4);
+ cmd_reg |= 0x404;
+ spdk_pci_device_cfg_write32(pci_dev, cmd_reg, 4);
+
+ if (nvme_ctrlr_get_cap(&pctrlr->ctrlr, &cap)) {
+ SPDK_ERRLOG("get_cap() failed\n");
+ close(claim_fd);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_vs(&pctrlr->ctrlr, &vs)) {
+ SPDK_ERRLOG("get_vs() failed\n");
+ close(claim_fd);
+ spdk_free(pctrlr);
+ return NULL;
+ }
+
+ nvme_ctrlr_init_cap(&pctrlr->ctrlr, &cap, &vs);
+
+ /* Doorbell stride is 2 ^ (dstrd + 2) bytes,
+ * but we want the stride in 4-byte (uint32_t) units, so drop the + 2 */
+ pctrlr->doorbell_stride_u32 = 1 << cap.bits.dstrd;
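+ /* For example, dstrd = 0 gives consecutive 4-byte doorbells (stride 1),
+ * while dstrd = 1 spaces them 8 bytes apart (stride 2). */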
+
+ rc = nvme_ctrlr_construct(&pctrlr->ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_destruct(&pctrlr->ctrlr);
+ return NULL;
+ }
+
+ pci_id = spdk_pci_device_get_id(pci_dev);
+ pctrlr->ctrlr.quirks = nvme_get_quirks(&pci_id);
+
+ rc = nvme_pcie_ctrlr_construct_admin_qpair(&pctrlr->ctrlr);
+ if (rc != 0) {
+ nvme_ctrlr_destruct(&pctrlr->ctrlr);
+ return NULL;
+ }
+
+ /* Construct the primary process properties */
+ rc = nvme_ctrlr_add_process(&pctrlr->ctrlr, pci_dev);
+ if (rc != 0) {
+ nvme_ctrlr_destruct(&pctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (g_sigset != true) {
+ nvme_pcie_ctrlr_setup_signal();
+ g_sigset = true;
+ }
+
+ return &pctrlr->ctrlr;
+}
+
+int
+nvme_pcie_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *padminq = nvme_pcie_qpair(ctrlr->adminq);
+ union spdk_nvme_aqa_register aqa;
+
+ if (nvme_pcie_ctrlr_set_asq(pctrlr, padminq->cmd_bus_addr)) {
+ SPDK_ERRLOG("set_asq() failed\n");
+ return -EIO;
+ }
+
+ if (nvme_pcie_ctrlr_set_acq(pctrlr, padminq->cpl_bus_addr)) {
+ SPDK_ERRLOG("set_acq() failed\n");
+ return -EIO;
+ }
+
+ aqa.raw = 0;
+ /* acqs and asqs are 0-based. */
+ aqa.bits.acqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
+ aqa.bits.asqs = nvme_pcie_qpair(ctrlr->adminq)->num_entries - 1;
+
+ if (nvme_pcie_ctrlr_set_aqa(pctrlr, &aqa)) {
+ SPDK_ERRLOG("set_aqa() failed\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+int
+nvme_pcie_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct spdk_pci_device *devhandle = nvme_ctrlr_proc_get_devhandle(ctrlr);
+
+ close(pctrlr->claim_fd);
+
+ if (ctrlr->adminq) {
+ nvme_pcie_qpair_destroy(ctrlr->adminq);
+ }
+
+ nvme_ctrlr_destruct_finish(ctrlr);
+
+ nvme_ctrlr_free_processes(ctrlr);
+
+ nvme_pcie_ctrlr_free_bars(pctrlr);
+
+ if (devhandle) {
+ spdk_pci_device_detach(devhandle);
+ }
+
+ spdk_free(pctrlr);
+
+ return 0;
+}
+
+static void
+nvme_qpair_construct_tracker(struct nvme_tracker *tr, uint16_t cid, uint64_t phys_addr)
+{
+ tr->prp_sgl_bus_addr = phys_addr + offsetof(struct nvme_tracker, u.prp);
+ tr->cid = cid;
+ tr->active = false;
+}
+
+int
+nvme_pcie_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ pqpair->sq_tail = pqpair->cq_head = 0;
+
+ /*
+ * The first time through the completion queue, HW will set the phase
+ * bit on completions to 1. So set this to 1 here, indicating
+ * we're looking for a 1 to know which entries have completed.
+ * We'll toggle the bit each time the completion queue rolls over.
+ */
+ pqpair->phase = 1;
+
+ memset(pqpair->cmd, 0,
+ pqpair->num_entries * sizeof(struct spdk_nvme_cmd));
+ memset(pqpair->cpl, 0,
+ pqpair->num_entries * sizeof(struct spdk_nvme_cpl));
+
+ return 0;
+}
+
+static int
+nvme_pcie_qpair_construct(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr;
+ uint16_t i;
+ volatile uint32_t *doorbell_base;
+ uint64_t offset;
+ uint16_t num_trackers;
+ size_t page_align = 0x200000;
+ uint32_t flags = SPDK_MALLOC_DMA;
+
+ /*
+ * Limit the maximum number of completions to return per call to prevent wraparound,
+ * and calculate how many trackers can be submitted at once without overflowing the
+ * completion queue.
+ */
+ pqpair->max_completions_cap = pqpair->num_entries / 4;
+ pqpair->max_completions_cap = spdk_max(pqpair->max_completions_cap, NVME_MIN_COMPLETIONS);
+ pqpair->max_completions_cap = spdk_min(pqpair->max_completions_cap, NVME_MAX_COMPLETIONS);
+ num_trackers = pqpair->num_entries - pqpair->max_completions_cap;
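+ /* For example, a 256-entry queue reserves 64 slots for completions and leaves
+ * 192 trackers for submissions, assuming the clamp above leaves the value unchanged. */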
+
+ SPDK_INFOLOG(SPDK_LOG_NVME, "max_completions_cap = %" PRIu16 " num_trackers = %" PRIu16 "\n",
+ pqpair->max_completions_cap, num_trackers);
+
+ assert(num_trackers != 0);
+
+ pqpair->sq_in_cmb = false;
+
+ if (nvme_qpair_is_admin_queue(&pqpair->qpair)) {
+ flags |= SPDK_MALLOC_SHARE;
+ }
+
+ /* cmd and cpl rings must be aligned on page size boundaries. */
+ if (ctrlr->opts.use_cmb_sqs) {
+ if (nvme_pcie_ctrlr_alloc_cmb(ctrlr, pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
+ sysconf(_SC_PAGESIZE), &offset) == 0) {
+ pqpair->cmd = pctrlr->cmb_bar_virt_addr + offset;
+ pqpair->cmd_bus_addr = pctrlr->cmb_bar_phys_addr + offset;
+ pqpair->sq_in_cmb = true;
+ }
+ }
+
+ /* To ensure physical address contiguity we make each ring occupy
+ * a single hugepage only. See MAX_IO_QUEUE_ENTRIES.
+ */
+ if (pqpair->sq_in_cmb == false) {
+ pqpair->cmd = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cmd),
+ page_align, &pqpair->cmd_bus_addr,
+ SPDK_ENV_SOCKET_ID_ANY, flags);
+ if (pqpair->cmd == NULL) {
+ SPDK_ERRLOG("alloc qpair_cmd failed\n");
+ return -ENOMEM;
+ }
+ }
+
+ pqpair->cpl = spdk_zmalloc(pqpair->num_entries * sizeof(struct spdk_nvme_cpl),
+ page_align, &pqpair->cpl_bus_addr,
+ SPDK_ENV_SOCKET_ID_ANY, flags);
+ if (pqpair->cpl == NULL) {
+ SPDK_ERRLOG("alloc qpair_cpl failed\n");
+ return -ENOMEM;
+ }
+
+ doorbell_base = &pctrlr->regs->doorbell[0].sq_tdbl;
+ pqpair->sq_tdbl = doorbell_base + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
+ pqpair->cq_hdbl = doorbell_base + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
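+ /* Doorbells for queue N are interleaved in BAR0: the SQ N tail doorbell sits
+ * at slot 2N and the CQ N head doorbell at slot 2N + 1, scaled by the
+ * doorbell stride. */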
+
+ /*
+ * Reserve space for all of the trackers in a single allocation.
+ * struct nvme_tracker must be padded so that its size is already a power of 2.
+ * This ensures the PRP list embedded in the nvme_tracker object will not span a
+ * 4KB boundary, while allowing access to trackers in tr[] via normal array indexing.
+ */
+ pqpair->tr = spdk_zmalloc(num_trackers * sizeof(*tr), sizeof(*tr), NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair->tr == NULL) {
+ SPDK_ERRLOG("nvme_tr failed\n");
+ return -ENOMEM;
+ }
+
+ TAILQ_INIT(&pqpair->free_tr);
+ TAILQ_INIT(&pqpair->outstanding_tr);
+
+ for (i = 0; i < num_trackers; i++) {
+ tr = &pqpair->tr[i];
+ nvme_qpair_construct_tracker(tr, i, spdk_vtophys(tr));
+ TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
+ }
+
+ nvme_pcie_qpair_reset(qpair);
+
+ return 0;
+}
+
+static inline void
+nvme_pcie_copy_command(struct spdk_nvme_cmd *dst, const struct spdk_nvme_cmd *src)
+{
+ /* dst and src are known to be non-overlapping and 64-byte aligned. */
+#if defined(__AVX__)
+ __m256i *d256 = (__m256i *)dst;
+ const __m256i *s256 = (const __m256i *)src;
+
+ _mm256_store_si256(&d256[0], _mm256_load_si256(&s256[0]));
+ _mm256_store_si256(&d256[1], _mm256_load_si256(&s256[1]));
+#elif defined(__SSE2__)
+ __m128i *d128 = (__m128i *)dst;
+ const __m128i *s128 = (const __m128i *)src;
+
+ _mm_store_si128(&d128[0], _mm_load_si128(&s128[0]));
+ _mm_store_si128(&d128[1], _mm_load_si128(&s128[1]));
+ _mm_store_si128(&d128[2], _mm_load_si128(&s128[2]));
+ _mm_store_si128(&d128[3], _mm_load_si128(&s128[3]));
+#else
+ *dst = *src;
+#endif
+}
+
+/**
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_pcie_qpair_insert_pending_admin_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, struct spdk_nvme_cpl *cpl)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_request *active_req = req;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /*
+ * The admin request is from another process. Move it to that
+ * process's per-process list so it can be handled later.
+ */
+ assert(nvme_qpair_is_admin_queue(qpair));
+ assert(active_req->pid != getpid());
+
+ active_proc = spdk_nvme_ctrlr_get_process(ctrlr, active_req->pid);
+ if (active_proc) {
+ /* Save the original completion information */
+ memcpy(&active_req->cpl, cpl, sizeof(*cpl));
+ STAILQ_INSERT_TAIL(&active_proc->active_reqs, active_req, stailq);
+ } else {
+ SPDK_ERRLOG("The owning process (pid %d) is not found. Dropping the request.\n",
+ active_req->pid);
+
+ nvme_free_request(active_req);
+ }
+}
+
+/**
+ * Note: the ctrlr_lock must be held when calling this function.
+ */
+static void
+nvme_pcie_qpair_complete_pending_admin_request(struct spdk_nvme_qpair *qpair)
+{
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_request *req, *tmp_req;
+ pid_t pid = getpid();
+ struct spdk_nvme_ctrlr_process *proc;
+
+ /*
+ * Check whether there is any pending admin request from
+ * other active processes.
+ */
+ assert(nvme_qpair_is_admin_queue(qpair));
+
+ proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ if (!proc) {
+ SPDK_ERRLOG("the active process (pid %d) is not found for this controller.\n", pid);
+ assert(proc);
+ return;
+ }
+
+ STAILQ_FOREACH_SAFE(req, &proc->active_reqs, stailq, tmp_req) {
+ STAILQ_REMOVE(&proc->active_reqs, req, nvme_request, stailq);
+
+ assert(req->pid == pid);
+
+ nvme_complete_request(req, &req->cpl);
+ nvme_free_request(req);
+ }
+}
+
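+/*
+ * Shadow doorbell helper: returns true when the controller's event index lies
+ * within [old, new_idx] in wraparound-safe 16-bit arithmetic, i.e. the entries
+ * covered by this update include the one the controller asked to be notified
+ * about, so an MMIO doorbell write is still required. For example, with
+ * old = 0xFFFE, new_idx = 0x0002 and event_idx = 0x0000, this evaluates
+ * (uint16_t)(2 - 0) = 2 <= (uint16_t)(2 - 0xFFFE) = 4, which is true.
+ */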
+static inline int
+nvme_pcie_qpair_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
+{
+ return (uint16_t)(new_idx - event_idx) <= (uint16_t)(new_idx - old);
+}
+
+static bool
+nvme_pcie_qpair_update_mmio_required(struct spdk_nvme_qpair *qpair, uint16_t value,
+ volatile uint32_t *shadow_db,
+ volatile uint32_t *eventidx)
+{
+ uint16_t old;
+
+ if (!shadow_db) {
+ return true;
+ }
+
+ old = *shadow_db;
+ *shadow_db = value;
+
+ if (!nvme_pcie_qpair_need_event(*eventidx, value, old)) {
+ return false;
+ }
+
+ return true;
+}
+
+static void
+nvme_pcie_qpair_submit_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
+{
+ struct nvme_request *req;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
+
+ req = tr->req;
+ assert(req != NULL);
+ req->timed_out = false;
+ if (spdk_unlikely(pctrlr->ctrlr.timeout_enabled)) {
+ req->submit_tick = spdk_get_ticks();
+ } else {
+ req->submit_tick = 0;
+ }
+
+ pqpair->tr[tr->cid].active = true;
+
+ /* Copy the command from the tracker to the submission queue. */
+ nvme_pcie_copy_command(&pqpair->cmd[pqpair->sq_tail], &req->cmd);
+
+ if (++pqpair->sq_tail == pqpair->num_entries) {
+ pqpair->sq_tail = 0;
+ }
+
+ if (pqpair->sq_tail == pqpair->sq_head) {
+ SPDK_ERRLOG("sq_tail is passing sq_head!\n");
+ }
+
+ spdk_wmb();
+ if (spdk_likely(nvme_pcie_qpair_update_mmio_required(qpair,
+ pqpair->sq_tail,
+ pqpair->sq_shadow_tdbl,
+ pqpair->sq_eventidx))) {
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(pqpair->sq_tdbl, pqpair->sq_tail);
+ g_thread_mmio_ctrlr = NULL;
+ }
+}
+
+static void
+nvme_pcie_qpair_complete_tracker(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr,
+ struct spdk_nvme_cpl *cpl, bool print_on_error)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_request *req;
+ bool retry, error, was_active;
+ bool req_from_current_proc = true;
+
+ req = tr->req;
+
+ assert(req != NULL);
+
+ error = spdk_nvme_cpl_is_error(cpl);
+ retry = error && nvme_completion_is_retry(cpl) &&
+ req->retries < spdk_nvme_retry_count;
+
+ if (error && print_on_error) {
+ nvme_qpair_print_command(qpair, &req->cmd);
+ nvme_qpair_print_completion(qpair, cpl);
+ }
+
+ was_active = pqpair->tr[cpl->cid].active;
+ pqpair->tr[cpl->cid].active = false;
+
+ assert(cpl->cid == req->cmd.cid);
+
+ if (retry) {
+ req->retries++;
+ nvme_pcie_qpair_submit_tracker(qpair, tr);
+ } else {
+ if (was_active) {
+ /* Only check admin requests from different processes. */
+ if (nvme_qpair_is_admin_queue(qpair) && req->pid != getpid()) {
+ req_from_current_proc = false;
+ nvme_pcie_qpair_insert_pending_admin_request(qpair, req, cpl);
+ } else {
+ nvme_complete_request(req, cpl);
+ }
+ }
+
+ if (req_from_current_proc == true) {
+ nvme_free_request(req);
+ }
+
+ tr->req = NULL;
+
+ TAILQ_REMOVE(&pqpair->outstanding_tr, tr, tq_list);
+ TAILQ_INSERT_HEAD(&pqpair->free_tr, tr, tq_list);
+
+ /*
+ * If the controller is in the middle of resetting, don't
+ * try to submit queued requests here - let the reset logic
+ * handle that instead.
+ */
+ if (!STAILQ_EMPTY(&qpair->queued_req) &&
+ !qpair->ctrlr->is_resetting) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ nvme_qpair_submit_request(qpair, req);
+ }
+ }
+}
+
+static void
+nvme_pcie_qpair_manual_complete_tracker(struct spdk_nvme_qpair *qpair,
+ struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr,
+ bool print_on_error)
+{
+ struct spdk_nvme_cpl cpl;
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.sqid = qpair->id;
+ cpl.cid = tr->cid;
+ cpl.status.sct = sct;
+ cpl.status.sc = sc;
+ cpl.status.dnr = dnr;
+ nvme_pcie_qpair_complete_tracker(qpair, tr, &cpl, print_on_error);
+}
+
+static void
+nvme_pcie_qpair_abort_trackers(struct spdk_nvme_qpair *qpair, uint32_t dnr)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr, *temp;
+
+ TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, temp) {
+ SPDK_ERRLOG("aborting outstanding command\n");
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, dnr, true);
+ }
+}
+
+static void
+nvme_pcie_admin_qpair_abort_aers(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_tracker *tr;
+
+ tr = TAILQ_FIRST(&pqpair->outstanding_tr);
+ while (tr != NULL) {
+ assert(tr->req != NULL);
+ if (tr->req->cmd.opc == SPDK_NVME_OPC_ASYNC_EVENT_REQUEST) {
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr,
+ SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_ABORTED_SQ_DELETION, 0,
+ false);
+ tr = TAILQ_FIRST(&pqpair->outstanding_tr);
+ } else {
+ tr = TAILQ_NEXT(tr, tq_list);
+ }
+ }
+}
+
+static void
+nvme_pcie_admin_qpair_destroy(struct spdk_nvme_qpair *qpair)
+{
+ nvme_pcie_admin_qpair_abort_aers(qpair);
+}
+
+static int
+nvme_pcie_qpair_destroy(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_pcie_admin_qpair_destroy(qpair);
+ }
+ if (pqpair->cmd && !pqpair->sq_in_cmb) {
+ spdk_free(pqpair->cmd);
+ }
+ if (pqpair->cpl) {
+ spdk_free(pqpair->cpl);
+ }
+ if (pqpair->tr) {
+ spdk_free(pqpair->tr);
+ }
+
+ nvme_qpair_deinit(qpair);
+
+ spdk_free(pqpair);
+
+ return 0;
+}
+
+static void
+nvme_pcie_admin_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ /*
+ * Manually abort each outstanding admin command. Do not retry
+ * admin commands found here, since they will be left over from
+ * a controller reset and it's likely the context in which the
+ * command was issued no longer applies.
+ */
+ nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
+}
+
+static void
+nvme_pcie_io_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ /* Manually abort each outstanding I/O. */
+ nvme_pcie_qpair_abort_trackers(qpair, 0);
+}
+
+int
+nvme_pcie_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ pqpair->is_enabled = true;
+ if (nvme_qpair_is_io_queue(qpair)) {
+ nvme_pcie_io_qpair_enable(qpair);
+ } else {
+ nvme_pcie_admin_qpair_enable(qpair);
+ }
+
+ return 0;
+}
+
+static void
+nvme_pcie_admin_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+ nvme_pcie_admin_qpair_abort_aers(qpair);
+}
+
+static void
+nvme_pcie_io_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+}
+
+int
+nvme_pcie_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ pqpair->is_enabled = false;
+ if (nvme_qpair_is_io_queue(qpair)) {
+ nvme_pcie_io_qpair_disable(qpair);
+ } else {
+ nvme_pcie_admin_qpair_disable(qpair);
+ }
+
+ return 0;
+}
+
+
+int
+nvme_pcie_qpair_fail(struct spdk_nvme_qpair *qpair)
+{
+ nvme_pcie_qpair_abort_trackers(qpair, 1 /* do not retry */);
+
+ return 0;
+}
+
+static int
+nvme_pcie_ctrlr_cmd_create_io_cq(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn,
+ void *cb_arg)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_CREATE_IO_CQ;
+
+ /*
+ * TODO: create a create io completion queue command data
+ * structure.
+ */
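+ /* cdw10: queue size (0-based) in bits 31:16, queue identifier in bits 15:0. */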
+ cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
+ /*
+ * 0x2 = interrupts enabled
+ * 0x1 = physically contiguous
+ */
+ cmd->cdw11 = 0x1;
+ cmd->dptr.prp.prp1 = pqpair->cpl_bus_addr;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_create_io_sq(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *io_que, spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(io_que);
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_CREATE_IO_SQ;
+
+ /*
+ * TODO: create a create io submission queue command data
+ * structure.
+ */
+ cmd->cdw10 = ((pqpair->num_entries - 1) << 16) | io_que->id;
+ /* 0x1 = physically contiguous */
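+ /* cdw11 also carries the paired completion queue ID (the same ID as the SQ
+ * here) in bits 31:16 and the queue priority in bits 2:1. */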
+ cmd->cdw11 = (io_que->id << 16) | (io_que->qprio << 1) | 0x1;
+ cmd->dptr.prp.prp1 = pqpair->cmd_bus_addr;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_delete_io_cq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DELETE_IO_CQ;
+ cmd->cdw10 = qpair->id;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+nvme_pcie_ctrlr_cmd_delete_io_sq(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ spdk_nvme_cmd_cb cb_fn, void *cb_arg)
+{
+ struct nvme_request *req;
+ struct spdk_nvme_cmd *cmd;
+
+ req = nvme_allocate_request_null(ctrlr->adminq, cb_fn, cb_arg);
+ if (req == NULL) {
+ return -ENOMEM;
+ }
+
+ cmd = &req->cmd;
+ cmd->opc = SPDK_NVME_OPC_DELETE_IO_SQ;
+ cmd->cdw10 = qpair->id;
+
+ return nvme_ctrlr_submit_admin_request(ctrlr, req);
+}
+
+static int
+_nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair,
+ uint16_t qid)
+{
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(ctrlr);
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_completion_poll_status status;
+ int rc;
+
+ rc = nvme_pcie_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("nvme_create_io_cq failed!\n");
+ return -1;
+ }
+
+ rc = nvme_pcie_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("nvme_create_io_sq failed!\n");
+ /* Attempt to delete the completion queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_cq(qpair->ctrlr, qpair, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return -1;
+ }
+ spdk_nvme_wait_for_completion(ctrlr->adminq, &status);
+ return -1;
+ }
+
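+ /* If the controller supports shadow doorbells (presumably set up elsewhere
+ * via the Doorbell Buffer Config admin command during controller init),
+ * point this qpair at its shadow and event-index slots so MMIO doorbell
+ * writes can be skipped when the controller does not need them. */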
+ if (ctrlr->shadow_doorbell) {
+ pqpair->sq_shadow_tdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
+ pqpair->cq_shadow_hdbl = ctrlr->shadow_doorbell + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
+ pqpair->sq_eventidx = ctrlr->eventidx + (2 * qpair->id + 0) * pctrlr->doorbell_stride_u32;
+ pqpair->cq_eventidx = ctrlr->eventidx + (2 * qpair->id + 1) * pctrlr->doorbell_stride_u32;
+ }
+ nvme_pcie_qpair_reset(qpair);
+
+ return 0;
+}
+
+struct spdk_nvme_qpair *
+nvme_pcie_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ struct nvme_pcie_qpair *pqpair;
+ struct spdk_nvme_qpair *qpair;
+ int rc;
+
+ assert(ctrlr != NULL);
+
+ pqpair = spdk_zmalloc(sizeof(*pqpair), 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (pqpair == NULL) {
+ return NULL;
+ }
+
+ pqpair->num_entries = opts->io_queue_size;
+
+ qpair = &pqpair->qpair;
+
+ rc = nvme_qpair_init(qpair, qid, ctrlr, opts->qprio, opts->io_queue_requests);
+ if (rc != 0) {
+ nvme_pcie_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ rc = nvme_pcie_qpair_construct(qpair);
+ if (rc != 0) {
+ nvme_pcie_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ rc = _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qid);
+
+ if (rc != 0) {
+ SPDK_ERRLOG("I/O queue creation failed\n");
+ nvme_pcie_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ return qpair;
+}
+
+int
+nvme_pcie_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ return _nvme_pcie_ctrlr_create_io_qpair(ctrlr, qpair, qpair->id);
+}
+
+int
+nvme_pcie_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_completion_poll_status status;
+ int rc;
+
+ assert(ctrlr != NULL);
+
+ if (ctrlr->is_removed) {
+ goto free;
+ }
+
+ /* Delete the I/O submission queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return rc;
+ }
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ return -1;
+ }
+
+ if (qpair->no_deletion_notification_needed == 0) {
+ /* Complete any I/O in the completion queue */
+ nvme_pcie_qpair_process_completions(qpair, 0);
+
+ /* Abort the rest of the I/O */
+ nvme_pcie_qpair_abort_trackers(qpair, 1);
+ }
+
+ /* Delete the completion queue */
+ rc = nvme_pcie_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ return rc;
+ }
+ if (spdk_nvme_wait_for_completion(ctrlr->adminq, &status)) {
+ return -1;
+ }
+
+free:
+ nvme_pcie_qpair_destroy(qpair);
+ return 0;
+}
+
+static void
+nvme_pcie_fail_request_bad_vtophys(struct spdk_nvme_qpair *qpair, struct nvme_tracker *tr)
+{
+ /*
+ * Bad vtophys translation, so abort this request and return
+ * immediately.
+ */
+ nvme_pcie_qpair_manual_complete_tracker(qpair, tr, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_INVALID_FIELD,
+ 1 /* do not retry */, true);
+}
+
+/*
+ * Append PRP list entries to describe a virtually contiguous buffer starting at virt_addr of len bytes.
+ *
+ * *prp_index will be updated to account for the number of PRP entries used.
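+ *
+ * For example, assuming a 4 KiB page size, a page-aligned 12 KiB buffer uses
+ * three PRP entries: prp1 covers the first page and, because more than two
+ * entries are needed, prp2 points to the tracker's PRP list holding the
+ * remaining two page addresses.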
+ */
+static int
+nvme_pcie_prp_list_append(struct nvme_tracker *tr, uint32_t *prp_index, void *virt_addr, size_t len,
+ uint32_t page_size)
+{
+ struct spdk_nvme_cmd *cmd = &tr->req->cmd;
+ uintptr_t page_mask = page_size - 1;
+ uint64_t phys_addr;
+ uint32_t i;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp_index:%u virt_addr:%p len:%u\n",
+ *prp_index, virt_addr, (uint32_t)len);
+
+ if (spdk_unlikely(((uintptr_t)virt_addr & 3) != 0)) {
+ SPDK_ERRLOG("virt_addr %p not dword aligned\n", virt_addr);
+ return -EINVAL;
+ }
+
+ i = *prp_index;
+ while (len) {
+ uint32_t seg_len;
+
+ /*
+ * prp_index 0 is stored in prp1, and the rest are stored in the prp[] array,
+ * so prp_index == count is valid.
+ */
+ if (spdk_unlikely(i > SPDK_COUNTOF(tr->u.prp))) {
+ SPDK_ERRLOG("out of PRP entries\n");
+ return -EINVAL;
+ }
+
+ phys_addr = spdk_vtophys(virt_addr);
+ if (spdk_unlikely(phys_addr == SPDK_VTOPHYS_ERROR)) {
+ SPDK_ERRLOG("vtophys(%p) failed\n", virt_addr);
+ return -EINVAL;
+ }
+
+ if (i == 0) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp1 = %p\n", (void *)phys_addr);
+ cmd->dptr.prp.prp1 = phys_addr;
+ seg_len = page_size - ((uintptr_t)virt_addr & page_mask);
+ } else {
+ if ((phys_addr & page_mask) != 0) {
+ SPDK_ERRLOG("PRP %u not page aligned (%p)\n", i, virt_addr);
+ return -EINVAL;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp[%u] = %p\n", i - 1, (void *)phys_addr);
+ tr->u.prp[i - 1] = phys_addr;
+ seg_len = page_size;
+ }
+
+ seg_len = spdk_min(seg_len, len);
+ virt_addr += seg_len;
+ len -= seg_len;
+ i++;
+ }
+
+ cmd->psdt = SPDK_NVME_PSDT_PRP;
+ if (i <= 1) {
+ cmd->dptr.prp.prp2 = 0;
+ } else if (i == 2) {
+ cmd->dptr.prp.prp2 = tr->u.prp[0];
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p\n", (void *)cmd->dptr.prp.prp2);
+ } else {
+ cmd->dptr.prp.prp2 = tr->prp_sgl_bus_addr;
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "prp2 = %p (PRP list)\n", (void *)cmd->dptr.prp.prp2);
+ }
+
+ *prp_index = i;
+ return 0;
+}
+
+/**
+ * Build PRP list describing physically contiguous payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_contig_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr)
+{
+ uint32_t prp_index = 0;
+ int rc;
+
+ rc = nvme_pcie_prp_list_append(tr, &prp_index, req->payload.contig_or_cb_arg + req->payload_offset,
+ req->payload_size, qpair->ctrlr->page_size);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return rc;
+ }
+
+ return 0;
+}
+
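+/* Offset of ptr within its 2 MiB hugepage. SGL elements are split on 2 MiB
+ * boundaries below because physical contiguity is generally only guaranteed
+ * within a single hugepage. */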
+#define _2MB_OFFSET(ptr) (((uintptr_t)(ptr)) & (0x200000 - 1))
+
+/**
+ * Build SGL list describing scattered payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_hw_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr)
+{
+ int rc;
+ void *virt_addr;
+ uint64_t phys_addr;
+ uint32_t remaining_transfer_len, remaining_user_sge_len, length;
+ struct spdk_nvme_sgl_descriptor *sgl;
+ uint32_t nseg = 0;
+
+ /*
+ * Build scattered payloads.
+ */
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ sgl = tr->u.sgl;
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.subtype = 0;
+
+ remaining_transfer_len = req->payload_size;
+
+ while (remaining_transfer_len > 0) {
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg,
+ &virt_addr, &remaining_user_sge_len);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -1;
+ }
+
+ remaining_user_sge_len = spdk_min(remaining_user_sge_len, remaining_transfer_len);
+ remaining_transfer_len -= remaining_user_sge_len;
+ while (remaining_user_sge_len > 0) {
+ if (nseg >= NVME_MAX_SGL_DESCRIPTORS) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -1;
+ }
+
+ phys_addr = spdk_vtophys(virt_addr);
+ if (phys_addr == SPDK_VTOPHYS_ERROR) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -1;
+ }
+
+ length = spdk_min(remaining_user_sge_len, 0x200000 - _2MB_OFFSET(virt_addr));
+ remaining_user_sge_len -= length;
+ virt_addr += length;
+
+ if (nseg > 0 && phys_addr ==
+ (*(sgl - 1)).address + (*(sgl - 1)).unkeyed.length) {
+ /* extend previous entry */
+ (*(sgl - 1)).unkeyed.length += length;
+ continue;
+ }
+
+ sgl->unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ sgl->unkeyed.length = length;
+ sgl->address = phys_addr;
+ sgl->unkeyed.subtype = 0;
+
+ sgl++;
+ nseg++;
+ }
+ }
+
+ if (nseg == 1) {
+ /*
+ * The whole transfer can be described by a single SGL descriptor.
+ * Use the special case described by the spec where SGL1's type is Data Block.
+ * This means the SGL in the tracker is not used at all, so copy the first (and only)
+ * SGL element into SGL1.
+ */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.address = tr->u.sgl[0].address;
+ req->cmd.dptr.sgl1.unkeyed.length = tr->u.sgl[0].unkeyed.length;
+ } else {
+ /* For now the driver only supports a single SGL segment per command */
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
+ req->cmd.dptr.sgl1.address = tr->prp_sgl_bus_addr;
+ req->cmd.dptr.sgl1.unkeyed.length = nseg * sizeof(struct spdk_nvme_sgl_descriptor);
+ }
+
+ return 0;
+}
+
+/**
+ * Build PRP list describing scattered payload buffer.
+ */
+static int
+nvme_pcie_qpair_build_prps_sgl_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req,
+ struct nvme_tracker *tr)
+{
+ int rc;
+ void *virt_addr;
+ uint32_t remaining_transfer_len, length;
+ uint32_t prp_index = 0;
+ uint32_t page_size = qpair->ctrlr->page_size;
+
+ /*
+ * Build scattered payloads.
+ */
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ remaining_transfer_len = req->payload_size;
+ while (remaining_transfer_len > 0) {
+ assert(req->payload.next_sge_fn != NULL);
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return -1;
+ }
+
+ length = spdk_min(remaining_transfer_len, length);
+
+ /*
+ * Any incompatible SGEs should have been handled in the splitting routine,
+ * but assert here as an additional check.
+ *
+ * All SGEs except the last must end on a page boundary.
+ */
+ assert((length == remaining_transfer_len) ||
+ _is_page_aligned((uintptr_t)virt_addr + length, page_size));
+
+ rc = nvme_pcie_prp_list_append(tr, &prp_index, virt_addr, length, page_size);
+ if (rc) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ return rc;
+ }
+
+ remaining_transfer_len -= length;
+ }
+
+ return 0;
+}
+
+static inline bool
+nvme_pcie_qpair_check_enabled(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ if (!pqpair->is_enabled &&
+ !qpair->ctrlr->is_resetting) {
+ nvme_qpair_enable(qpair);
+ }
+ return pqpair->is_enabled;
+}
+
+int
+nvme_pcie_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ struct nvme_tracker *tr;
+ int rc = 0;
+ void *md_payload;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+
+ nvme_pcie_qpair_check_enabled(qpair);
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ }
+
+ tr = TAILQ_FIRST(&pqpair->free_tr);
+
+ if (tr == NULL || !pqpair->is_enabled) {
+ /*
+ * No tracker is available, or the qpair is disabled due to
+ * an in-progress controller-level reset.
+ *
+ * Put the request on the qpair's request queue to be
+ * processed when a tracker frees up via a command
+ * completion or when the controller reset is
+ * completed.
+ */
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+ goto exit;
+ }
+
+ TAILQ_REMOVE(&pqpair->free_tr, tr, tq_list); /* remove tr from free_tr */
+ TAILQ_INSERT_TAIL(&pqpair->outstanding_tr, tr, tq_list);
+ tr->req = req;
+ req->cmd.cid = tr->cid;
+
+ if (req->payload_size && req->payload.md) {
+ md_payload = req->payload.md + req->md_offset;
+ tr->req->cmd.mptr = spdk_vtophys(md_payload);
+ if (tr->req->cmd.mptr == SPDK_VTOPHYS_ERROR) {
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ rc = -EINVAL;
+ goto exit;
+ }
+ }
+
+ if (req->payload_size == 0) {
+ /* Null payload - leave PRP fields zeroed */
+ rc = 0;
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
+ rc = nvme_pcie_qpair_build_contig_request(qpair, req, tr);
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
+ if (ctrlr->flags & SPDK_NVME_CTRLR_SGL_SUPPORTED) {
+ rc = nvme_pcie_qpair_build_hw_sgl_request(qpair, req, tr);
+ } else {
+ rc = nvme_pcie_qpair_build_prps_sgl_request(qpair, req, tr);
+ }
+ } else {
+ assert(0);
+ nvme_pcie_fail_request_bad_vtophys(qpair, tr);
+ rc = -EINVAL;
+ }
+
+ if (rc < 0) {
+ goto exit;
+ }
+
+ nvme_pcie_qpair_submit_tracker(qpair, tr);
+
+exit:
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ }
+
+ return rc;
+}
+
+static void
+nvme_pcie_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
+{
+ uint64_t t02;
+ struct nvme_tracker *tr, *tmp;
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /* Don't check timeouts during controller initialization. */
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ } else {
+ active_proc = qpair->active_proc;
+ }
+
+ /* Only check timeouts if the current process has a timeout callback. */
+ if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
+ return;
+ }
+
+ t02 = spdk_get_ticks();
+ TAILQ_FOREACH_SAFE(tr, &pqpair->outstanding_tr, tq_list, tmp) {
+ assert(tr->req != NULL);
+
+ if (nvme_request_check_timeout(tr->req, tr->cid, active_proc, t02)) {
+ /*
+ * The requests are in order, so as soon as one has not timed out,
+ * stop iterating.
+ */
+ break;
+ }
+ }
+}
+
+int32_t
+nvme_pcie_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ struct nvme_pcie_qpair *pqpair = nvme_pcie_qpair(qpair);
+ struct nvme_pcie_ctrlr *pctrlr = nvme_pcie_ctrlr(qpair->ctrlr);
+ struct nvme_tracker *tr;
+ struct spdk_nvme_cpl *cpl;
+ uint32_t num_completions = 0;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+
+ if (spdk_unlikely(!nvme_pcie_qpair_check_enabled(qpair))) {
+ /*
+ * qpair is not enabled, likely because a controller reset is
+ * in progress. Ignore the interrupt - any I/O that was
+ * associated with this interrupt will get retried when the
+ * reset is complete.
+ */
+ return 0;
+ }
+
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_robust_mutex_lock(&ctrlr->ctrlr_lock);
+ }
+
+ if (max_completions == 0 || max_completions > pqpair->max_completions_cap) {
+ /*
+ * max_completions == 0 means unlimited, but complete at most
+ * max_completions_cap completions per call so that the completion
+ * queue doorbell doesn't wrap around.
+ */
+ max_completions = pqpair->max_completions_cap;
+ }
+
+ while (1) {
+ cpl = &pqpair->cpl[pqpair->cq_head];
+
+ if (cpl->status.p != pqpair->phase) {
+ break;
+ }
+#ifdef __PPC64__
+ /*
+ * This memory barrier prevents reordering of:
+ * - loads from/stores to tr (load after store)
+ * - the cpl cid load relative to the cpl phase load (load after load)
+ */
+ spdk_mb();
+#endif
+
+ if (spdk_unlikely(++pqpair->cq_head == pqpair->num_entries)) {
+ pqpair->cq_head = 0;
+ pqpair->phase = !pqpair->phase;
+ }
+
+ tr = &pqpair->tr[cpl->cid];
+ pqpair->sq_head = cpl->sqhd;
+
+ if (tr->active) {
+ nvme_pcie_qpair_complete_tracker(qpair, tr, cpl, true);
+ } else {
+ SPDK_ERRLOG("cpl does not map to outstanding cmd\n");
+ nvme_qpair_print_completion(qpair, cpl);
+ assert(0);
+ }
+
+ if (++num_completions == max_completions) {
+ break;
+ }
+ }
+
+ if (num_completions > 0) {
+ if (spdk_likely(nvme_pcie_qpair_update_mmio_required(qpair, pqpair->cq_head,
+ pqpair->cq_shadow_hdbl,
+ pqpair->cq_eventidx))) {
+ g_thread_mmio_ctrlr = pctrlr;
+ spdk_mmio_write_4(pqpair->cq_hdbl, pqpair->cq_head);
+ g_thread_mmio_ctrlr = NULL;
+ }
+ }
+
+ if (spdk_unlikely(ctrlr->timeout_enabled)) {
+ /*
+ * The user registered a timeout callback, so check for timed-out requests.
+ */
+ nvme_pcie_qpair_check_timeout(qpair);
+ }
+
+ /* Before returning, complete any pending admin request. */
+ if (spdk_unlikely(nvme_qpair_is_admin_queue(qpair))) {
+ nvme_pcie_qpair_complete_pending_admin_request(qpair);
+
+ nvme_robust_mutex_unlock(&ctrlr->ctrlr_lock);
+ }
+
+ return num_completions;
+}
diff --git a/src/spdk/lib/nvme/nvme_qpair.c b/src/spdk/lib/nvme/nvme_qpair.c
new file mode 100644
index 00000000..9f585798
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_qpair.c
@@ -0,0 +1,663 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+#include "spdk/nvme_ocssd.h"
+
+static void nvme_qpair_fail(struct spdk_nvme_qpair *qpair);
+
+struct nvme_string {
+ uint16_t value;
+ const char *str;
+};
+
+static const struct nvme_string admin_opcode[] = {
+ { SPDK_NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" },
+ { SPDK_NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" },
+ { SPDK_NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" },
+ { SPDK_NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" },
+ { SPDK_NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" },
+ { SPDK_NVME_OPC_IDENTIFY, "IDENTIFY" },
+ { SPDK_NVME_OPC_ABORT, "ABORT" },
+ { SPDK_NVME_OPC_SET_FEATURES, "SET FEATURES" },
+ { SPDK_NVME_OPC_GET_FEATURES, "GET FEATURES" },
+ { SPDK_NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" },
+ { SPDK_NVME_OPC_NS_MANAGEMENT, "NAMESPACE MANAGEMENT" },
+ { SPDK_NVME_OPC_FIRMWARE_COMMIT, "FIRMWARE COMMIT" },
+ { SPDK_NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" },
+ { SPDK_NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" },
+ { SPDK_NVME_OPC_NS_ATTACHMENT, "NAMESPACE ATTACHMENT" },
+ { SPDK_NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" },
+ { SPDK_NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" },
+ { SPDK_NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" },
+ { SPDK_NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" },
+ { SPDK_NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" },
+ { SPDK_NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" },
+ { SPDK_NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" },
+ { SPDK_NVME_OPC_FORMAT_NVM, "FORMAT NVM" },
+ { SPDK_NVME_OPC_SECURITY_SEND, "SECURITY SEND" },
+ { SPDK_NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" },
+ { SPDK_NVME_OPC_SANITIZE, "SANITIZE" },
+ { SPDK_OCSSD_OPC_GEOMETRY, "OCSSD / GEOMETRY" },
+ { 0xFFFF, "ADMIN COMMAND" }
+};
+
+static const struct nvme_string io_opcode[] = {
+ { SPDK_NVME_OPC_FLUSH, "FLUSH" },
+ { SPDK_NVME_OPC_WRITE, "WRITE" },
+ { SPDK_NVME_OPC_READ, "READ" },
+ { SPDK_NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" },
+ { SPDK_NVME_OPC_COMPARE, "COMPARE" },
+ { SPDK_NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" },
+ { SPDK_NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" },
+ { SPDK_NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" },
+ { SPDK_NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" },
+ { SPDK_NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" },
+ { SPDK_NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" },
+ { SPDK_OCSSD_OPC_VECTOR_RESET, "OCSSD / VECTOR RESET" },
+ { SPDK_OCSSD_OPC_VECTOR_WRITE, "OCSSD / VECTOR WRITE" },
+ { SPDK_OCSSD_OPC_VECTOR_READ, "OCSSD / VECTOR READ" },
+ { SPDK_OCSSD_OPC_VECTOR_COPY, "OCSSD / VECTOR COPY" },
+ { 0xFFFF, "IO COMMAND" }
+};
+
+static const char *
+nvme_get_string(const struct nvme_string *strings, uint16_t value)
+{
+ const struct nvme_string *entry;
+
+ entry = strings;
+
+ while (entry->value != 0xFFFF) {
+ if (entry->value == value) {
+ return entry->str;
+ }
+ entry++;
+ }
+ return entry->str;
+}
+
+static void
+nvme_admin_qpair_print_command(struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd)
+{
+
+ SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%x "
+ "cdw10:%08x cdw11:%08x\n",
+ nvme_get_string(admin_opcode, cmd->opc), cmd->opc, qpair->id, cmd->cid,
+ cmd->nsid, cmd->cdw10, cmd->cdw11);
+}
+
+static void
+nvme_io_qpair_print_command(struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cmd *cmd)
+{
+ assert(qpair != NULL);
+ assert(cmd != NULL);
+ switch ((int)cmd->opc) {
+ case SPDK_NVME_OPC_WRITE:
+ case SPDK_NVME_OPC_READ:
+ case SPDK_NVME_OPC_WRITE_UNCORRECTABLE:
+ case SPDK_NVME_OPC_COMPARE:
+ SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d "
+ "lba:%llu len:%d\n",
+ nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid,
+ cmd->nsid,
+ ((unsigned long long)cmd->cdw11 << 32) + cmd->cdw10,
+ (cmd->cdw12 & 0xFFFF) + 1);
+ break;
+ case SPDK_NVME_OPC_FLUSH:
+ case SPDK_NVME_OPC_DATASET_MANAGEMENT:
+ SPDK_NOTICELOG("%s sqid:%d cid:%d nsid:%d\n",
+ nvme_get_string(io_opcode, cmd->opc), qpair->id, cmd->cid,
+ cmd->nsid);
+ break;
+ default:
+ SPDK_NOTICELOG("%s (%02x) sqid:%d cid:%d nsid:%d\n",
+ nvme_get_string(io_opcode, cmd->opc), cmd->opc, qpair->id,
+ cmd->cid, cmd->nsid);
+ break;
+ }
+}
+
+void
+nvme_qpair_print_command(struct spdk_nvme_qpair *qpair, struct spdk_nvme_cmd *cmd)
+{
+ assert(qpair != NULL);
+ assert(cmd != NULL);
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ nvme_admin_qpair_print_command(qpair, cmd);
+ } else {
+ nvme_io_qpair_print_command(qpair, cmd);
+ }
+}
+
+static const struct nvme_string generic_status[] = {
+ { SPDK_NVME_SC_SUCCESS, "SUCCESS" },
+ { SPDK_NVME_SC_INVALID_OPCODE, "INVALID OPCODE" },
+ { SPDK_NVME_SC_INVALID_FIELD, "INVALID FIELD" },
+ { SPDK_NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" },
+ { SPDK_NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" },
+ { SPDK_NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" },
+ { SPDK_NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" },
+ { SPDK_NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" },
+ { SPDK_NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" },
+ { SPDK_NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" },
+ { SPDK_NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" },
+ { SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" },
+ { SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" },
+ { SPDK_NVME_SC_INVALID_SGL_SEG_DESCRIPTOR, "INVALID SGL SEGMENT DESCRIPTOR" },
+ { SPDK_NVME_SC_INVALID_NUM_SGL_DESCIRPTORS, "INVALID NUMBER OF SGL DESCRIPTORS" },
+ { SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" },
+ { SPDK_NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" },
+ { SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" },
+ { SPDK_NVME_SC_INVALID_CONTROLLER_MEM_BUF, "INVALID CONTROLLER MEMORY BUFFER" },
+ { SPDK_NVME_SC_INVALID_PRP_OFFSET, "INVALID PRP OFFSET" },
+ { SPDK_NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" },
+ { SPDK_NVME_SC_OPERATION_DENIED, "OPERATION DENIED" },
+ { SPDK_NVME_SC_INVALID_SGL_OFFSET, "INVALID SGL OFFSET" },
+ { SPDK_NVME_SC_HOSTID_INCONSISTENT_FORMAT, "HOSTID INCONSISTENT FORMAT" },
+ { SPDK_NVME_SC_KEEP_ALIVE_EXPIRED, "KEEP ALIVE EXPIRED" },
+ { SPDK_NVME_SC_KEEP_ALIVE_INVALID, "KEEP ALIVE INVALID" },
+ { SPDK_NVME_SC_ABORTED_PREEMPT, "ABORTED - PREEMPT AND ABORT" },
+ { SPDK_NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" },
+ { SPDK_NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" },
+ { SPDK_NVME_SC_SGL_DATA_BLOCK_GRANULARITY_INVALID, "DATA BLOCK GRANULARITY INVALID" },
+ { SPDK_NVME_SC_COMMAND_INVALID_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" },
+ { SPDK_NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" },
+ { SPDK_NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" },
+ { SPDK_NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" },
+ { SPDK_NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" },
+ { SPDK_NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" },
+ { 0xFFFF, "GENERIC" }
+};
+
+static const struct nvme_string command_specific_status[] = {
+ { SPDK_NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" },
+ { SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" },
+ { SPDK_NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" },
+ { SPDK_NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" },
+ { SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" },
+ { SPDK_NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" },
+ { SPDK_NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" },
+ { SPDK_NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" },
+ { SPDK_NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" },
+ { SPDK_NVME_SC_INVALID_FORMAT, "INVALID FORMAT" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_CONVENTIONAL_RESET, "FIRMWARE REQUIRES CONVENTIONAL RESET" },
+ { SPDK_NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" },
+ { SPDK_NVME_SC_FEATURE_ID_NOT_SAVEABLE, "FEATURE ID NOT SAVEABLE" },
+ { SPDK_NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" },
+ { SPDK_NVME_SC_FEATURE_NOT_NAMESPACE_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_NVM_RESET, "FIRMWARE REQUIRES NVM RESET" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_RESET, "FIRMWARE REQUIRES RESET" },
+ { SPDK_NVME_SC_FIRMWARE_REQ_MAX_TIME_VIOLATION, "FIRMWARE REQUIRES MAX TIME VIOLATION" },
+ { SPDK_NVME_SC_FIRMWARE_ACTIVATION_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" },
+ { SPDK_NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" },
+ { SPDK_NVME_SC_NAMESPACE_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" },
+ { SPDK_NVME_SC_NAMESPACE_ID_UNAVAILABLE, "NAMESPACE ID UNAVAILABLE" },
+ { SPDK_NVME_SC_NAMESPACE_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" },
+ { SPDK_NVME_SC_NAMESPACE_IS_PRIVATE, "NAMESPACE IS PRIVATE" },
+ { SPDK_NVME_SC_NAMESPACE_NOT_ATTACHED, "NAMESPACE NOT ATTACHED" },
+ { SPDK_NVME_SC_THINPROVISIONING_NOT_SUPPORTED, "THINPROVISIONING NOT SUPPORTED" },
+ { SPDK_NVME_SC_CONTROLLER_LIST_INVALID, "CONTROLLER LIST INVALID" },
+ { SPDK_NVME_SC_DEVICE_SELF_TEST_IN_PROGRESS, "DEVICE SELF-TEST IN PROGRESS" },
+ { SPDK_NVME_SC_BOOT_PARTITION_WRITE_PROHIBITED, "BOOT PARTITION WRITE PROHIBITED" },
+ { SPDK_NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER ID" },
+ { SPDK_NVME_SC_INVALID_SECONDARY_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" },
+ { SPDK_NVME_SC_INVALID_NUM_CTRLR_RESOURCES, "INVALID NUMBER OF CONTROLLER RESOURCES" },
+ { SPDK_NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" },
+ { SPDK_NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" },
+ { SPDK_NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" },
+ { SPDK_NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" },
+ { 0xFFFF, "COMMAND SPECIFIC" }
+};
+
+static const struct nvme_string media_error_status[] = {
+ { SPDK_NVME_SC_WRITE_FAULTS, "WRITE FAULTS" },
+ { SPDK_NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" },
+ { SPDK_NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" },
+ { SPDK_NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" },
+ { SPDK_NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" },
+ { SPDK_NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" },
+ { SPDK_NVME_SC_ACCESS_DENIED, "ACCESS DENIED" },
+ { SPDK_NVME_SC_DEALLOCATED_OR_UNWRITTEN_BLOCK, "DEALLOCATED OR UNWRITTEN BLOCK" },
+ { SPDK_OCSSD_SC_OFFLINE_CHUNK, "RESET OFFLINE CHUNK" },
+ { SPDK_OCSSD_SC_INVALID_RESET, "INVALID RESET" },
+ { SPDK_OCSSD_SC_WRITE_FAIL_WRITE_NEXT_UNIT, "WRITE FAIL WRITE NEXT UNIT" },
+ { SPDK_OCSSD_SC_WRITE_FAIL_CHUNK_EARLY_CLOSE, "WRITE FAIL CHUNK EARLY CLOSE" },
+ { SPDK_OCSSD_SC_OUT_OF_ORDER_WRITE, "OUT OF ORDER WRITE" },
+ { SPDK_OCSSD_SC_READ_HIGH_ECC, "READ HIGH ECC" },
+ { 0xFFFF, "MEDIA ERROR" }
+};
+
+static const struct nvme_string path_status[] = {
+ { SPDK_NVME_SC_INTERNAL_PATH_ERROR, "INTERNAL PATH ERROR" },
+ { SPDK_NVME_SC_CONTROLLER_PATH_ERROR, "CONTROLLER PATH ERROR" },
+ { SPDK_NVME_SC_HOST_PATH_ERROR, "HOST PATH ERROR" },
+ { SPDK_NVME_SC_ABORTED_BY_HOST, "ABORTED BY HOST" },
+ { 0xFFFF, "PATH ERROR" }
+};
+
+static const char *
+get_status_string(uint16_t sct, uint16_t sc)
+{
+ const struct nvme_string *entry;
+
+ switch (sct) {
+ case SPDK_NVME_SCT_GENERIC:
+ entry = generic_status;
+ break;
+ case SPDK_NVME_SCT_COMMAND_SPECIFIC:
+ entry = command_specific_status;
+ break;
+ case SPDK_NVME_SCT_MEDIA_ERROR:
+ entry = media_error_status;
+ break;
+ case SPDK_NVME_SCT_PATH:
+ entry = path_status;
+ break;
+ case SPDK_NVME_SCT_VENDOR_SPECIFIC:
+ return "VENDOR SPECIFIC";
+ default:
+ return "RESERVED";
+ }
+
+ return nvme_get_string(entry, sc);
+}
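+
+/*
+ * Editor's note (illustrative sketch, not part of the upstream patch): each
+ * table above is terminated by a 0xFFFF sentinel whose string is intended as
+ * the fallback for unlisted codes, so a lookup always yields something
+ * printable. The sample lookups below are hypothetical:
+ *
+ *   get_status_string(SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_LBA_OUT_OF_RANGE)
+ *       -> "LBA OUT OF RANGE"
+ *   get_status_string(SPDK_NVME_SCT_GENERIC, 0x7F)      (not in the table)
+ *       -> "GENERIC"
+ *   get_status_string(SPDK_NVME_SCT_VENDOR_SPECIFIC, 0)
+ *       -> "VENDOR SPECIFIC"
+ */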
+
+void
+nvme_qpair_print_completion(struct spdk_nvme_qpair *qpair,
+ struct spdk_nvme_cpl *cpl)
+{
+ SPDK_NOTICELOG("%s (%02x/%02x) sqid:%d cid:%d cdw0:%x sqhd:%04x p:%x m:%x dnr:%x\n",
+ get_status_string(cpl->status.sct, cpl->status.sc),
+ cpl->status.sct, cpl->status.sc, cpl->sqid, cpl->cid, cpl->cdw0,
+ cpl->sqhd, cpl->status.p, cpl->status.m, cpl->status.dnr);
+}
+
+bool
+nvme_completion_is_retry(const struct spdk_nvme_cpl *cpl)
+{
+ /*
+	 * TODO: the spec is not clear on how commands aborted due to TLER
+	 * will be marked. For now, NAMESPACE_NOT_READY and FORMAT_IN_PROGRESS
+	 * are the only cases where the DNR bit is consulted.
+ */
+ switch ((int)cpl->status.sct) {
+ case SPDK_NVME_SCT_GENERIC:
+ switch ((int)cpl->status.sc) {
+ case SPDK_NVME_SC_NAMESPACE_NOT_READY:
+ case SPDK_NVME_SC_FORMAT_IN_PROGRESS:
+ if (cpl->status.dnr) {
+ return false;
+ } else {
+ return true;
+ }
+ case SPDK_NVME_SC_INVALID_OPCODE:
+ case SPDK_NVME_SC_INVALID_FIELD:
+ case SPDK_NVME_SC_COMMAND_ID_CONFLICT:
+ case SPDK_NVME_SC_DATA_TRANSFER_ERROR:
+ case SPDK_NVME_SC_ABORTED_POWER_LOSS:
+ case SPDK_NVME_SC_INTERNAL_DEVICE_ERROR:
+ case SPDK_NVME_SC_ABORTED_BY_REQUEST:
+ case SPDK_NVME_SC_ABORTED_SQ_DELETION:
+ case SPDK_NVME_SC_ABORTED_FAILED_FUSED:
+ case SPDK_NVME_SC_ABORTED_MISSING_FUSED:
+ case SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT:
+ case SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR:
+ case SPDK_NVME_SC_LBA_OUT_OF_RANGE:
+ case SPDK_NVME_SC_CAPACITY_EXCEEDED:
+ default:
+ return false;
+ }
+ case SPDK_NVME_SCT_PATH:
+ /*
+ * Per NVMe TP 4028 (Path and Transport Error Enhancements), retries should be
+ * based on the setting of the DNR bit for Internal Path Error
+ */
+ switch ((int)cpl->status.sc) {
+ case SPDK_NVME_SC_INTERNAL_PATH_ERROR:
+ return !cpl->status.dnr;
+ default:
+ return false;
+ }
+ case SPDK_NVME_SCT_COMMAND_SPECIFIC:
+ case SPDK_NVME_SCT_MEDIA_ERROR:
+ case SPDK_NVME_SCT_VENDOR_SPECIFIC:
+ default:
+ return false;
+ }
+}
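+
+#ifdef NVME_QPAIR_RETRY_EXAMPLE /* editor's illustrative sketch; guard macro and function name are hypothetical */
+/*
+ * Minimal demonstration of the rules above: NAMESPACE_NOT_READY is retried
+ * only until the controller sets the Do Not Retry (DNR) bit, while most
+ * other generic status codes are never retried.
+ */
+static void
+example_retry_decision(void)
+{
+	struct spdk_nvme_cpl cpl;
+
+	memset(&cpl, 0, sizeof(cpl));
+	cpl.status.sct = SPDK_NVME_SCT_GENERIC;
+	cpl.status.sc = SPDK_NVME_SC_NAMESPACE_NOT_READY;
+
+	assert(nvme_completion_is_retry(&cpl) == true);		/* DNR clear: retry */
+
+	cpl.status.dnr = 1;
+	assert(nvme_completion_is_retry(&cpl) == false);	/* DNR set: give up */
+
+	cpl.status.dnr = 0;
+	cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE;
+	assert(nvme_completion_is_retry(&cpl) == false);	/* never retried */
+}
+#endif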
+
+static void
+nvme_qpair_manual_complete_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req, uint32_t sct, uint32_t sc,
+ bool print_on_error)
+{
+ struct spdk_nvme_cpl cpl;
+ bool error;
+
+ memset(&cpl, 0, sizeof(cpl));
+ cpl.sqid = qpair->id;
+ cpl.status.sct = sct;
+ cpl.status.sc = sc;
+
+ error = spdk_nvme_cpl_is_error(&cpl);
+
+ if (error && print_on_error) {
+ SPDK_NOTICELOG("Command completed manually:\n");
+ nvme_qpair_print_command(qpair, &req->cmd);
+ nvme_qpair_print_completion(qpair, &cpl);
+ }
+
+ nvme_complete_request(req, &cpl);
+ nvme_free_request(req);
+}
+
+int32_t
+spdk_nvme_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ int32_t ret;
+ struct nvme_request *req, *tmp;
+
+ if (qpair->ctrlr->is_failed) {
+ nvme_qpair_fail(qpair);
+ return 0;
+ }
+
+	/* Complete queued error-injected requests whose artificial timeout has expired. */
+ if (spdk_unlikely(!STAILQ_EMPTY(&qpair->err_req_head))) {
+ STAILQ_FOREACH_SAFE(req, &qpair->err_req_head, stailq, tmp) {
+ if (spdk_get_ticks() - req->submit_tick > req->timeout_tsc) {
+ STAILQ_REMOVE(&qpair->err_req_head, req, nvme_request, stailq);
+ nvme_qpair_manual_complete_request(qpair, req,
+ req->cpl.status.sct,
+ req->cpl.status.sc, true);
+ }
+ }
+ }
+
+ qpair->in_completion_context = 1;
+ ret = nvme_transport_qpair_process_completions(qpair, max_completions);
+ qpair->in_completion_context = 0;
+ if (qpair->delete_after_completion_context) {
+ /*
+ * A request to delete this qpair was made in the context of this completion
+ * routine - so it is safe to delete it now.
+ */
+ spdk_nvme_ctrlr_free_io_qpair(qpair);
+ }
+ return ret;
+}
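+
+#ifdef NVME_QPAIR_POLL_EXAMPLE /* editor's illustrative sketch; guard macro, function and "done" flag are hypothetical */
+/*
+ * Typical caller-side use of the function above. The driver is polled-mode,
+ * so an application thread must call spdk_nvme_qpair_process_completions()
+ * repeatedly until its I/O completion callbacks have fired.
+ */
+static void
+example_poll_until_done(struct spdk_nvme_qpair *qpair, volatile bool *done)
+{
+	int32_t rc;
+
+	while (!*done) {
+		/* max_completions == 0 means "process whatever is available". */
+		rc = spdk_nvme_qpair_process_completions(qpair, 0);
+		if (rc < 0) {
+			SPDK_ERRLOG("completion processing failed: %d\n", rc);
+			break;
+		}
+	}
+}
+#endif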
+
+int
+nvme_qpair_init(struct spdk_nvme_qpair *qpair, uint16_t id,
+ struct spdk_nvme_ctrlr *ctrlr,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests)
+{
+ size_t req_size_padded;
+ uint32_t i;
+
+ qpair->id = id;
+ qpair->qprio = qprio;
+
+ qpair->in_completion_context = 0;
+ qpair->delete_after_completion_context = 0;
+ qpair->no_deletion_notification_needed = 0;
+
+ qpair->ctrlr = ctrlr;
+ qpair->trtype = ctrlr->trid.trtype;
+
+ STAILQ_INIT(&qpair->free_req);
+ STAILQ_INIT(&qpair->queued_req);
+ TAILQ_INIT(&qpair->err_cmd_head);
+ STAILQ_INIT(&qpair->err_req_head);
+
+ req_size_padded = (sizeof(struct nvme_request) + 63) & ~(size_t)63;
+
+ qpair->req_buf = spdk_zmalloc(req_size_padded * num_requests, 64, NULL,
+ SPDK_ENV_SOCKET_ID_ANY, SPDK_MALLOC_SHARE);
+ if (qpair->req_buf == NULL) {
+		SPDK_ERRLOG("no memory to allocate qpair (cntlid:0x%x sqid:%d) req_buf with %d requests\n",
+ ctrlr->cntlid, qpair->id, num_requests);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < num_requests; i++) {
+ struct nvme_request *req = qpair->req_buf + i * req_size_padded;
+
+ STAILQ_INSERT_HEAD(&qpair->free_req, req, stailq);
+ }
+
+ return 0;
+}
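+
+/*
+ * Editor's note on the padding above (illustrative, not part of the upstream
+ * patch): "(x + 63) & ~63" rounds x up to the next multiple of 64 so that
+ * each nvme_request starts on its own 64-byte cache line. For example, if
+ * sizeof(struct nvme_request) were 200 bytes (a hypothetical value):
+ *
+ *   (200 + 63) & ~63 = 263 & ~63 = 256
+ *
+ * so consecutive requests would be laid out 256 bytes apart in req_buf.
+ */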
+
+void
+nvme_qpair_deinit(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+ struct nvme_error_cmd *cmd, *entry;
+
+ while (!STAILQ_EMPTY(&qpair->err_req_head)) {
+ req = STAILQ_FIRST(&qpair->err_req_head);
+ STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq);
+ nvme_qpair_manual_complete_request(qpair, req,
+ req->cpl.status.sct,
+ req->cpl.status.sc, true);
+ }
+
+ TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) {
+ TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link);
+ spdk_dma_free(cmd);
+ }
+
+ spdk_dma_free(qpair->req_buf);
+}
+
+int
+nvme_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ int rc = 0;
+ struct nvme_request *child_req, *tmp;
+ struct nvme_error_cmd *cmd;
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ bool child_req_failed = false;
+
+ if (ctrlr->is_failed) {
+ nvme_free_request(req);
+ return -ENXIO;
+ }
+
+ if (req->num_children) {
+ /*
+ * This is a split (parent) request. Submit all of the children but not the parent
+ * request itself, since the parent is the original unsplit request.
+ */
+ TAILQ_FOREACH_SAFE(child_req, &req->children, child_tailq, tmp) {
+ if (!child_req_failed) {
+ rc = nvme_qpair_submit_request(qpair, child_req);
+ if (rc != 0) {
+ child_req_failed = true;
+ }
+			} else { /* free remaining child_reqs since one child_req failed */
+ nvme_request_remove_child(req, child_req);
+ nvme_free_request(child_req);
+ }
+ }
+
+ return rc;
+ }
+
+	/* Queue requests whose opcode matches an entry in the err_cmd list. */
+ if (spdk_unlikely(!TAILQ_EMPTY(&qpair->err_cmd_head))) {
+ TAILQ_FOREACH(cmd, &qpair->err_cmd_head, link) {
+ if (!cmd->do_not_submit) {
+ continue;
+ }
+
+ if ((cmd->opc == req->cmd.opc) && cmd->err_count) {
+ /* add to error request list and set cpl */
+ req->timeout_tsc = cmd->timeout_tsc;
+ req->submit_tick = spdk_get_ticks();
+ req->cpl.status.sct = cmd->status.sct;
+ req->cpl.status.sc = cmd->status.sc;
+ STAILQ_INSERT_TAIL(&qpair->err_req_head, req, stailq);
+ cmd->err_count--;
+ return 0;
+ }
+ }
+ }
+
+ return nvme_transport_qpair_submit_request(qpair, req);
+}
+
+static void
+_nvme_io_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ /* Manually abort each queued I/O. */
+ while (!STAILQ_EMPTY(&qpair->queued_req)) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ SPDK_ERRLOG("aborting queued i/o\n");
+ nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, true);
+ }
+}
+
+void
+nvme_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ if (nvme_qpair_is_io_queue(qpair)) {
+ _nvme_io_qpair_enable(qpair);
+ }
+
+ nvme_transport_qpair_enable(qpair);
+}
+
+void
+nvme_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ while (!STAILQ_EMPTY(&qpair->err_req_head)) {
+ req = STAILQ_FIRST(&qpair->err_req_head);
+ STAILQ_REMOVE_HEAD(&qpair->err_req_head, stailq);
+ nvme_qpair_manual_complete_request(qpair, req,
+ req->cpl.status.sct,
+ req->cpl.status.sc, true);
+ }
+
+ nvme_transport_qpair_disable(qpair);
+}
+
+static void
+nvme_qpair_fail(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_request *req;
+
+ while (!STAILQ_EMPTY(&qpair->queued_req)) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ SPDK_ERRLOG("failing queued i/o\n");
+ nvme_qpair_manual_complete_request(qpair, req, SPDK_NVME_SCT_GENERIC,
+ SPDK_NVME_SC_ABORTED_BY_REQUEST, true);
+ }
+
+ nvme_transport_qpair_fail(qpair);
+}
+
+int
+spdk_nvme_qpair_add_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ uint8_t opc, bool do_not_submit,
+ uint64_t timeout_in_us,
+ uint32_t err_count,
+ uint8_t sct, uint8_t sc)
+{
+ struct nvme_error_cmd *entry, *cmd = NULL;
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ TAILQ_FOREACH(entry, &qpair->err_cmd_head, link) {
+ if (entry->opc == opc) {
+ cmd = entry;
+ break;
+ }
+ }
+
+ if (cmd == NULL) {
+ cmd = spdk_dma_zmalloc(sizeof(*cmd), 64, NULL);
+ if (!cmd) {
+ return -ENOMEM;
+ }
+ TAILQ_INSERT_TAIL(&qpair->err_cmd_head, cmd, link);
+ }
+
+ cmd->do_not_submit = do_not_submit;
+ cmd->err_count = err_count;
+ cmd->timeout_tsc = timeout_in_us * spdk_get_ticks_hz() / 1000000ULL;
+ cmd->opc = opc;
+ cmd->status.sct = sct;
+ cmd->status.sc = sc;
+
+ return 0;
+}
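+
+#ifdef NVME_QPAIR_ERR_INJECT_EXAMPLE /* editor's illustrative sketch; guard macro and function name are hypothetical */
+/*
+ * Typical use of the error-injection API above: make the next two READ
+ * commands on "qpair" fail with INVALID FIELD instead of being sent to the
+ * controller. The ctrlr/qpair arguments are assumed to come from the caller.
+ */
+static void
+example_inject_read_errors(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+	int rc;
+
+	rc = spdk_nvme_qpair_add_cmd_error_injection(ctrlr, qpair,
+			SPDK_NVME_OPC_READ,
+			true,	/* do_not_submit: complete from the error queue instead */
+			0,	/* timeout_in_us: complete on the next completion poll */
+			2,	/* err_count: apply to the next two matching commands */
+			SPDK_NVME_SCT_GENERIC, SPDK_NVME_SC_INVALID_FIELD);
+	if (rc != 0) {
+		SPDK_ERRLOG("failed to add error injection: %d\n", rc);
+		return;
+	}
+
+	/* ... submit I/O and poll completions here ... */
+
+	spdk_nvme_qpair_remove_cmd_error_injection(ctrlr, qpair, SPDK_NVME_OPC_READ);
+}
+#endif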
+
+void
+spdk_nvme_qpair_remove_cmd_error_injection(struct spdk_nvme_ctrlr *ctrlr,
+ struct spdk_nvme_qpair *qpair,
+ uint8_t opc)
+{
+ struct nvme_error_cmd *cmd, *entry;
+
+ if (qpair == NULL) {
+ qpair = ctrlr->adminq;
+ }
+
+ TAILQ_FOREACH_SAFE(cmd, &qpair->err_cmd_head, link, entry) {
+ if (cmd->opc == opc) {
+ TAILQ_REMOVE(&qpair->err_cmd_head, cmd, link);
+ spdk_dma_free(cmd);
+ return;
+ }
+ }
+
+ return;
+}
diff --git a/src/spdk/lib/nvme/nvme_quirks.c b/src/spdk/lib/nvme/nvme_quirks.c
new file mode 100644
index 00000000..9a213b12
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_quirks.c
@@ -0,0 +1,141 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "nvme_internal.h"
+
+struct nvme_quirk {
+ struct spdk_pci_id id;
+ uint64_t flags;
+};
+
+static const struct nvme_quirk nvme_quirks[] = {
+ { {SPDK_PCI_VID_INTEL, 0x0953, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE
+ },
+ { {SPDK_PCI_VID_INTEL, 0x0A53, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE
+ },
+ { {SPDK_PCI_VID_INTEL, 0x0A54, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE
+ },
+ { {SPDK_PCI_VID_INTEL, 0x0A55, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_INTEL_QUIRK_READ_LATENCY |
+ NVME_INTEL_QUIRK_WRITE_LATENCY |
+ NVME_INTEL_QUIRK_STRIPING |
+ NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE
+ },
+ { {SPDK_PCI_VID_MEMBLAZE, 0x0540, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_VID_SAMSUNG, 0xa821, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_VID_SAMSUNG, 0xa822, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_BEFORE_CHK_RDY
+ },
+ { {SPDK_PCI_VID_VIRTUALBOX, 0x4e56, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC
+ },
+ { {SPDK_PCI_VID_INTEL, 0x5845, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_IDENTIFY_CNS |
+ NVME_INTEL_QUIRK_NO_LOG_PAGES
+ },
+ { {SPDK_PCI_VID_CNEXLABS, 0x1f1f, SPDK_PCI_ANY_ID, SPDK_PCI_ANY_ID},
+ NVME_QUIRK_IDENTIFY_CNS |
+ NVME_QUIRK_OCSSD
+ },
+ { {0x0000, 0x0000, 0x0000, 0x0000}, 0}
+};
+
+/* Compare each field. SPDK_PCI_ANY_ID in s1 matches everything */
+static bool
+pci_id_match(const struct spdk_pci_id *s1, const struct spdk_pci_id *s2)
+{
+ if ((s1->vendor_id == SPDK_PCI_ANY_ID || s1->vendor_id == s2->vendor_id) &&
+ (s1->device_id == SPDK_PCI_ANY_ID || s1->device_id == s2->device_id) &&
+ (s1->subvendor_id == SPDK_PCI_ANY_ID || s1->subvendor_id == s2->subvendor_id) &&
+ (s1->subdevice_id == SPDK_PCI_ANY_ID || s1->subdevice_id == s2->subdevice_id)) {
+ return true;
+ }
+ return false;
+}
+
+uint64_t
+nvme_get_quirks(const struct spdk_pci_id *id)
+{
+ const struct nvme_quirk *quirk = nvme_quirks;
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Searching for %04x:%04x [%04x:%04x]...\n",
+ id->vendor_id, id->device_id,
+ id->subvendor_id, id->subdevice_id);
+
+ while (quirk->id.vendor_id) {
+ if (pci_id_match(&quirk->id, id)) {
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Matched quirk %04x:%04x [%04x:%04x]:\n",
+ quirk->id.vendor_id, quirk->id.device_id,
+ quirk->id.subvendor_id, quirk->id.subdevice_id);
+
+#define PRINT_QUIRK(quirk_flag) \
+ do { \
+ if (quirk->flags & (quirk_flag)) { \
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Quirk enabled: %s\n", #quirk_flag); \
+ } \
+ } while (0)
+
+ PRINT_QUIRK(NVME_INTEL_QUIRK_READ_LATENCY);
+ PRINT_QUIRK(NVME_INTEL_QUIRK_WRITE_LATENCY);
+ PRINT_QUIRK(NVME_QUIRK_DELAY_BEFORE_CHK_RDY);
+ PRINT_QUIRK(NVME_INTEL_QUIRK_STRIPING);
+ PRINT_QUIRK(NVME_QUIRK_DELAY_AFTER_QUEUE_ALLOC);
+ PRINT_QUIRK(NVME_QUIRK_READ_ZERO_AFTER_DEALLOCATE);
+ PRINT_QUIRK(NVME_QUIRK_IDENTIFY_CNS);
+ PRINT_QUIRK(NVME_QUIRK_OCSSD);
+
+ return quirk->flags;
+ }
+ quirk++;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "No quirks found.\n");
+
+ return 0;
+}
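+
+#ifdef NVME_QUIRKS_EXAMPLE /* editor's illustrative sketch; guard macro and function name are hypothetical */
+/*
+ * Example lookup against the table above. Intel device 0x0953 is one of the
+ * listed entries; the subsystem IDs below are arbitrary illustration values,
+ * which still match because the table entry uses SPDK_PCI_ANY_ID for them.
+ */
+static void
+example_quirk_lookup(void)
+{
+	struct spdk_pci_id id = {
+		.vendor_id = SPDK_PCI_VID_INTEL,
+		.device_id = 0x0953,
+		.subvendor_id = 0x1234,
+		.subdevice_id = 0x5678,
+	};
+	uint64_t quirks = nvme_get_quirks(&id);
+
+	if (quirks & NVME_INTEL_QUIRK_STRIPING) {
+		SPDK_DEBUGLOG(SPDK_LOG_NVME, "striping quirk is set for this device\n");
+	}
+}
+#endif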
diff --git a/src/spdk/lib/nvme/nvme_rdma.c b/src/spdk/lib/nvme/nvme_rdma.c
new file mode 100644
index 00000000..b356e3a1
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_rdma.c
@@ -0,0 +1,1634 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe over RDMA transport
+ */
+
+#include "spdk/stdinc.h"
+
+#include <infiniband/verbs.h>
+#include <rdma/rdma_cma.h>
+#include <rdma/rdma_verbs.h>
+
+#include "spdk/assert.h"
+#include "spdk/log.h"
+#include "spdk/trace.h"
+#include "spdk/event.h"
+#include "spdk/queue.h"
+#include "spdk/nvme.h"
+#include "spdk/nvmf_spec.h"
+#include "spdk/string.h"
+#include "spdk/endian.h"
+#include "spdk/likely.h"
+
+#include "nvme_internal.h"
+
+#define NVME_RDMA_TIME_OUT_IN_MS 2000
+#define NVME_RDMA_RW_BUFFER_SIZE 131072
+
+/*
+ * NVME RDMA qpair Resource Defaults
+ */
+#define NVME_RDMA_DEFAULT_TX_SGE 2
+#define NVME_RDMA_DEFAULT_RX_SGE 1
+
+
+/* Max number of NVMe-oF SGL descriptors supported by the host */
+#define NVME_RDMA_MAX_SGL_DESCRIPTORS 16
+struct spdk_nvmf_cmd {
+ struct spdk_nvme_cmd cmd;
+ struct spdk_nvme_sgl_descriptor sgl[NVME_RDMA_MAX_SGL_DESCRIPTORS];
+};
+
+/* Mapping from virtual address to ibv_mr pointer for a protection domain */
+struct spdk_nvme_rdma_mr_map {
+ struct ibv_pd *pd;
+ struct spdk_mem_map *map;
+ uint64_t ref;
+ LIST_ENTRY(spdk_nvme_rdma_mr_map) link;
+};
+
+/* NVMe RDMA transport extensions for spdk_nvme_ctrlr */
+struct nvme_rdma_ctrlr {
+ struct spdk_nvme_ctrlr ctrlr;
+};
+
+/* NVMe RDMA qpair extensions for spdk_nvme_qpair */
+struct nvme_rdma_qpair {
+ struct spdk_nvme_qpair qpair;
+
+ struct rdma_event_channel *cm_channel;
+
+ struct rdma_cm_id *cm_id;
+
+ struct ibv_cq *cq;
+
+ struct spdk_nvme_rdma_req *rdma_reqs;
+
+ uint16_t num_entries;
+
+ /* Parallel arrays of response buffers + response SGLs of size num_entries */
+ struct ibv_sge *rsp_sgls;
+ struct spdk_nvme_cpl *rsps;
+
+ struct ibv_recv_wr *rsp_recv_wrs;
+
+ /* Memory region describing all rsps for this qpair */
+ struct ibv_mr *rsp_mr;
+
+ /*
+ * Array of num_entries NVMe commands registered as RDMA message buffers.
+ * Indexed by rdma_req->id.
+ */
+ struct spdk_nvmf_cmd *cmds;
+
+ /* Memory region describing all cmds for this qpair */
+ struct ibv_mr *cmd_mr;
+
+ struct spdk_nvme_rdma_mr_map *mr_map;
+
+ TAILQ_HEAD(, spdk_nvme_rdma_req) free_reqs;
+ TAILQ_HEAD(, spdk_nvme_rdma_req) outstanding_reqs;
+};
+
+struct spdk_nvme_rdma_req {
+ int id;
+
+ struct ibv_send_wr send_wr;
+
+ struct nvme_request *req;
+
+ struct ibv_sge send_sgl[NVME_RDMA_DEFAULT_TX_SGE];
+
+ TAILQ_ENTRY(spdk_nvme_rdma_req) link;
+};
+
+static const char *rdma_cm_event_str[] = {
+ "RDMA_CM_EVENT_ADDR_RESOLVED",
+ "RDMA_CM_EVENT_ADDR_ERROR",
+ "RDMA_CM_EVENT_ROUTE_RESOLVED",
+ "RDMA_CM_EVENT_ROUTE_ERROR",
+ "RDMA_CM_EVENT_CONNECT_REQUEST",
+ "RDMA_CM_EVENT_CONNECT_RESPONSE",
+ "RDMA_CM_EVENT_CONNECT_ERROR",
+ "RDMA_CM_EVENT_UNREACHABLE",
+ "RDMA_CM_EVENT_REJECTED",
+ "RDMA_CM_EVENT_ESTABLISHED",
+ "RDMA_CM_EVENT_DISCONNECTED",
+ "RDMA_CM_EVENT_DEVICE_REMOVAL",
+ "RDMA_CM_EVENT_MULTICAST_JOIN",
+ "RDMA_CM_EVENT_MULTICAST_ERROR",
+ "RDMA_CM_EVENT_ADDR_CHANGE",
+ "RDMA_CM_EVENT_TIMEWAIT_EXIT"
+};
+
+static LIST_HEAD(, spdk_nvme_rdma_mr_map) g_rdma_mr_maps = LIST_HEAD_INITIALIZER(&g_rdma_mr_maps);
+static pthread_mutex_t g_rdma_mr_maps_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static int nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair);
+
+static inline struct nvme_rdma_qpair *
+nvme_rdma_qpair(struct spdk_nvme_qpair *qpair)
+{
+ assert(qpair->trtype == SPDK_NVME_TRANSPORT_RDMA);
+ return SPDK_CONTAINEROF(qpair, struct nvme_rdma_qpair, qpair);
+}
+
+static inline struct nvme_rdma_ctrlr *
+nvme_rdma_ctrlr(struct spdk_nvme_ctrlr *ctrlr)
+{
+ assert(ctrlr->trid.trtype == SPDK_NVME_TRANSPORT_RDMA);
+ return SPDK_CONTAINEROF(ctrlr, struct nvme_rdma_ctrlr, ctrlr);
+}
+
+static struct spdk_nvme_rdma_req *
+nvme_rdma_req_get(struct nvme_rdma_qpair *rqpair)
+{
+ struct spdk_nvme_rdma_req *rdma_req;
+
+ rdma_req = TAILQ_FIRST(&rqpair->free_reqs);
+ if (rdma_req) {
+ TAILQ_REMOVE(&rqpair->free_reqs, rdma_req, link);
+ TAILQ_INSERT_TAIL(&rqpair->outstanding_reqs, rdma_req, link);
+ }
+
+ return rdma_req;
+}
+
+static void
+nvme_rdma_req_put(struct nvme_rdma_qpair *rqpair, struct spdk_nvme_rdma_req *rdma_req)
+{
+ TAILQ_REMOVE(&rqpair->outstanding_reqs, rdma_req, link);
+ TAILQ_INSERT_HEAD(&rqpair->free_reqs, rdma_req, link);
+}
+
+static void
+nvme_rdma_req_complete(struct nvme_request *req,
+ struct spdk_nvme_cpl *rsp)
+{
+ nvme_complete_request(req, rsp);
+ nvme_free_request(req);
+}
+
+static const char *
+nvme_rdma_cm_event_str_get(uint32_t event)
+{
+ if (event < SPDK_COUNTOF(rdma_cm_event_str)) {
+ return rdma_cm_event_str[event];
+ } else {
+ return "Undefined";
+ }
+}
+
+static struct rdma_cm_event *
+nvme_rdma_get_event(struct rdma_event_channel *channel,
+ enum rdma_cm_event_type evt)
+{
+ struct rdma_cm_event *event;
+ int rc;
+
+ rc = rdma_get_cm_event(channel, &event);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to get event from CM event channel. Error %d (%s)\n",
+ errno, spdk_strerror(errno));
+ return NULL;
+ }
+
+ if (event->event != evt) {
+ SPDK_ERRLOG("Expected %s but received %s (%d) from CM event channel (status = %d)\n",
+ nvme_rdma_cm_event_str_get(evt),
+ nvme_rdma_cm_event_str_get(event->event), event->event, event->status);
+ rdma_ack_cm_event(event);
+ return NULL;
+ }
+
+ return event;
+}
+
+static int
+nvme_rdma_qpair_init(struct nvme_rdma_qpair *rqpair)
+{
+ int rc;
+ struct ibv_qp_init_attr attr;
+
+ rqpair->cq = ibv_create_cq(rqpair->cm_id->verbs, rqpair->num_entries * 2, rqpair, NULL, 0);
+ if (!rqpair->cq) {
+ SPDK_ERRLOG("Unable to create completion queue: errno %d: %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+
+ memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
+ attr.qp_type = IBV_QPT_RC;
+ attr.send_cq = rqpair->cq;
+ attr.recv_cq = rqpair->cq;
+ attr.cap.max_send_wr = rqpair->num_entries; /* SEND operations */
+ attr.cap.max_recv_wr = rqpair->num_entries; /* RECV operations */
+ attr.cap.max_send_sge = NVME_RDMA_DEFAULT_TX_SGE;
+ attr.cap.max_recv_sge = NVME_RDMA_DEFAULT_RX_SGE;
+
+ rc = rdma_create_qp(rqpair->cm_id, NULL, &attr);
+ if (rc) {
+ SPDK_ERRLOG("rdma_create_qp failed\n");
+ return -1;
+ }
+
+ rqpair->cm_id->context = &rqpair->qpair;
+
+ return 0;
+}
+
+#define nvme_rdma_trace_ibv_sge(sg_list) \
+ if (sg_list) { \
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "local addr %p length 0x%x lkey 0x%x\n", \
+ (void *)(sg_list)->addr, (sg_list)->length, (sg_list)->lkey); \
+ }
+
+static int
+nvme_rdma_post_recv(struct nvme_rdma_qpair *rqpair, uint16_t rsp_idx)
+{
+ struct ibv_recv_wr *wr, *bad_wr = NULL;
+ int rc;
+
+ wr = &rqpair->rsp_recv_wrs[rsp_idx];
+ nvme_rdma_trace_ibv_sge(wr->sg_list);
+
+ rc = ibv_post_recv(rqpair->cm_id->qp, wr, &bad_wr);
+ if (rc) {
+ SPDK_ERRLOG("Failure posting rdma recv, rc = 0x%x\n", rc);
+ }
+
+ return rc;
+}
+
+static void
+nvme_rdma_free_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ if (rqpair->rsp_mr && rdma_dereg_mr(rqpair->rsp_mr)) {
+ SPDK_ERRLOG("Unable to de-register rsp_mr\n");
+ }
+ rqpair->rsp_mr = NULL;
+
+ free(rqpair->rsps);
+ rqpair->rsps = NULL;
+ free(rqpair->rsp_sgls);
+ rqpair->rsp_sgls = NULL;
+ free(rqpair->rsp_recv_wrs);
+ rqpair->rsp_recv_wrs = NULL;
+}
+
+static int
+nvme_rdma_alloc_rsps(struct nvme_rdma_qpair *rqpair)
+{
+ uint16_t i;
+
+ rqpair->rsp_mr = NULL;
+ rqpair->rsps = NULL;
+ rqpair->rsp_recv_wrs = NULL;
+
+ rqpair->rsp_sgls = calloc(rqpair->num_entries, sizeof(*rqpair->rsp_sgls));
+ if (!rqpair->rsp_sgls) {
+ SPDK_ERRLOG("Failed to allocate rsp_sgls\n");
+ goto fail;
+ }
+
+ rqpair->rsp_recv_wrs = calloc(rqpair->num_entries,
+ sizeof(*rqpair->rsp_recv_wrs));
+ if (!rqpair->rsp_recv_wrs) {
+ SPDK_ERRLOG("Failed to allocate rsp_recv_wrs\n");
+ goto fail;
+ }
+
+ rqpair->rsps = calloc(rqpair->num_entries, sizeof(*rqpair->rsps));
+ if (!rqpair->rsps) {
+ SPDK_ERRLOG("can not allocate rdma rsps\n");
+ goto fail;
+ }
+
+ rqpair->rsp_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->rsps,
+ rqpair->num_entries * sizeof(*rqpair->rsps));
+ if (rqpair->rsp_mr == NULL) {
+ SPDK_ERRLOG("Unable to register rsp_mr\n");
+ goto fail;
+ }
+
+ for (i = 0; i < rqpair->num_entries; i++) {
+ struct ibv_sge *rsp_sgl = &rqpair->rsp_sgls[i];
+
+ rsp_sgl->addr = (uint64_t)&rqpair->rsps[i];
+ rsp_sgl->length = sizeof(rqpair->rsps[i]);
+ rsp_sgl->lkey = rqpair->rsp_mr->lkey;
+
+ rqpair->rsp_recv_wrs[i].wr_id = i;
+ rqpair->rsp_recv_wrs[i].next = NULL;
+ rqpair->rsp_recv_wrs[i].sg_list = rsp_sgl;
+ rqpair->rsp_recv_wrs[i].num_sge = 1;
+
+ if (nvme_rdma_post_recv(rqpair, i)) {
+ SPDK_ERRLOG("Unable to post connection rx desc\n");
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ nvme_rdma_free_rsps(rqpair);
+ return -ENOMEM;
+}
+
+static void
+nvme_rdma_free_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ if (!rqpair->rdma_reqs) {
+ return;
+ }
+
+ if (rqpair->cmd_mr && rdma_dereg_mr(rqpair->cmd_mr)) {
+ SPDK_ERRLOG("Unable to de-register cmd_mr\n");
+ }
+ rqpair->cmd_mr = NULL;
+
+ free(rqpair->cmds);
+ rqpair->cmds = NULL;
+
+ free(rqpair->rdma_reqs);
+ rqpair->rdma_reqs = NULL;
+}
+
+static int
+nvme_rdma_alloc_reqs(struct nvme_rdma_qpair *rqpair)
+{
+ int i;
+
+ rqpair->rdma_reqs = calloc(rqpair->num_entries, sizeof(struct spdk_nvme_rdma_req));
+ if (rqpair->rdma_reqs == NULL) {
+ SPDK_ERRLOG("Failed to allocate rdma_reqs\n");
+ goto fail;
+ }
+
+ rqpair->cmds = calloc(rqpair->num_entries, sizeof(*rqpair->cmds));
+ if (!rqpair->cmds) {
+ SPDK_ERRLOG("Failed to allocate RDMA cmds\n");
+ goto fail;
+ }
+
+ rqpair->cmd_mr = rdma_reg_msgs(rqpair->cm_id, rqpair->cmds,
+ rqpair->num_entries * sizeof(*rqpair->cmds));
+ if (!rqpair->cmd_mr) {
+ SPDK_ERRLOG("Unable to register cmd_mr\n");
+ goto fail;
+ }
+
+ TAILQ_INIT(&rqpair->free_reqs);
+ TAILQ_INIT(&rqpair->outstanding_reqs);
+ for (i = 0; i < rqpair->num_entries; i++) {
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct spdk_nvmf_cmd *cmd;
+
+ rdma_req = &rqpair->rdma_reqs[i];
+ cmd = &rqpair->cmds[i];
+
+ rdma_req->id = i;
+
+ /* The first RDMA sgl element will always point
+ * at this data structure. Depending on whether
+ * an NVMe-oF SGL is required, the length of
+ * this element may change. */
+ rdma_req->send_sgl[0].addr = (uint64_t)cmd;
+ rdma_req->send_sgl[0].lkey = rqpair->cmd_mr->lkey;
+
+ rdma_req->send_wr.wr_id = (uint64_t)rdma_req;
+ rdma_req->send_wr.next = NULL;
+ rdma_req->send_wr.opcode = IBV_WR_SEND;
+ rdma_req->send_wr.send_flags = IBV_SEND_SIGNALED;
+ rdma_req->send_wr.sg_list = rdma_req->send_sgl;
+ rdma_req->send_wr.imm_data = 0;
+
+ TAILQ_INSERT_TAIL(&rqpair->free_reqs, rdma_req, link);
+ }
+
+ return 0;
+
+fail:
+ nvme_rdma_free_reqs(rqpair);
+ return -ENOMEM;
+}
+
+static int
+nvme_rdma_recv(struct nvme_rdma_qpair *rqpair, uint64_t rsp_idx)
+{
+ struct spdk_nvme_qpair *qpair = &rqpair->qpair;
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct spdk_nvme_cpl *rsp;
+ struct nvme_request *req;
+
+ assert(rsp_idx < rqpair->num_entries);
+ rsp = &rqpair->rsps[rsp_idx];
+ rdma_req = &rqpair->rdma_reqs[rsp->cid];
+
+ req = rdma_req->req;
+ nvme_rdma_req_complete(req, rsp);
+
+ nvme_rdma_req_put(rqpair, rdma_req);
+ if (nvme_rdma_post_recv(rqpair, rsp_idx)) {
+ SPDK_ERRLOG("Unable to re-post rx descriptor\n");
+ return -1;
+ }
+
+ if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) {
+ req = STAILQ_FIRST(&qpair->queued_req);
+ STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
+ nvme_qpair_submit_request(qpair, req);
+ }
+
+ return 0;
+}
+
+static int
+nvme_rdma_resolve_addr(struct nvme_rdma_qpair *rqpair,
+ struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_event_channel *cm_channel)
+{
+ int ret;
+ struct rdma_cm_event *event;
+
+ ret = rdma_resolve_addr(rqpair->cm_id, src_addr, dst_addr,
+ NVME_RDMA_TIME_OUT_IN_MS);
+ if (ret) {
+ SPDK_ERRLOG("rdma_resolve_addr, %d\n", errno);
+ return ret;
+ }
+
+ event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ADDR_RESOLVED);
+ if (event == NULL) {
+ SPDK_ERRLOG("RDMA address resolution error\n");
+ return -1;
+ }
+ rdma_ack_cm_event(event);
+
+ ret = rdma_resolve_route(rqpair->cm_id, NVME_RDMA_TIME_OUT_IN_MS);
+ if (ret) {
+ SPDK_ERRLOG("rdma_resolve_route\n");
+ return ret;
+ }
+
+ event = nvme_rdma_get_event(cm_channel, RDMA_CM_EVENT_ROUTE_RESOLVED);
+ if (event == NULL) {
+ SPDK_ERRLOG("RDMA route resolution error\n");
+ return -1;
+ }
+ rdma_ack_cm_event(event);
+
+ return 0;
+}
+
+static int
+nvme_rdma_connect(struct nvme_rdma_qpair *rqpair)
+{
+ struct rdma_conn_param param = {};
+ struct spdk_nvmf_rdma_request_private_data request_data = {};
+ struct spdk_nvmf_rdma_accept_private_data *accept_data;
+ struct ibv_device_attr attr;
+ int ret;
+ struct rdma_cm_event *event;
+ struct spdk_nvme_ctrlr *ctrlr;
+
+ ret = ibv_query_device(rqpair->cm_id->verbs, &attr);
+ if (ret != 0) {
+ SPDK_ERRLOG("Failed to query RDMA device attributes.\n");
+ return ret;
+ }
+
+ param.responder_resources = spdk_min(rqpair->num_entries, attr.max_qp_rd_atom);
+
+ ctrlr = rqpair->qpair.ctrlr;
+ if (!ctrlr) {
+ return -1;
+ }
+
+ request_data.qid = rqpair->qpair.id;
+ request_data.hrqsize = rqpair->num_entries;
+ request_data.hsqsize = rqpair->num_entries - 1;
+ request_data.cntlid = ctrlr->cntlid;
+
+ param.private_data = &request_data;
+ param.private_data_len = sizeof(request_data);
+ param.retry_count = 7;
+ param.rnr_retry_count = 7;
+
+ ret = rdma_connect(rqpair->cm_id, &param);
+ if (ret) {
+ SPDK_ERRLOG("nvme rdma connect error\n");
+ return ret;
+ }
+
+ event = nvme_rdma_get_event(rqpair->cm_channel, RDMA_CM_EVENT_ESTABLISHED);
+ if (event == NULL) {
+ SPDK_ERRLOG("RDMA connect error\n");
+ return -1;
+ }
+
+ accept_data = (struct spdk_nvmf_rdma_accept_private_data *)event->param.conn.private_data;
+ if (accept_data == NULL) {
+ rdma_ack_cm_event(event);
+ SPDK_ERRLOG("NVMe-oF target did not return accept data\n");
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "Requested queue depth %d. Actually got queue depth %d.\n",
+ rqpair->num_entries, accept_data->crqsize);
+
+ rqpair->num_entries = spdk_min(rqpair->num_entries, accept_data->crqsize);
+
+ rdma_ack_cm_event(event);
+
+ return 0;
+}
+
+static int
+nvme_rdma_parse_addr(struct sockaddr_storage *sa, int family, const char *addr, const char *service)
+{
+ struct addrinfo *res;
+ struct addrinfo hints;
+ int ret;
+
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = family;
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_protocol = 0;
+
+ ret = getaddrinfo(addr, service, &hints, &res);
+ if (ret) {
+ SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(ret), ret);
+ return ret;
+ }
+
+ if (res->ai_addrlen > sizeof(*sa)) {
+ SPDK_ERRLOG("getaddrinfo() ai_addrlen %zu too large\n", (size_t)res->ai_addrlen);
+ ret = EINVAL;
+ } else {
+ memcpy(sa, res->ai_addr, res->ai_addrlen);
+ }
+
+ freeaddrinfo(res);
+ return ret;
+}
+
+static int
+nvme_rdma_mr_map_notify(void *cb_ctx, struct spdk_mem_map *map,
+ enum spdk_mem_map_notify_action action,
+ void *vaddr, size_t size)
+{
+ struct ibv_pd *pd = cb_ctx;
+ struct ibv_mr *mr;
+ int rc;
+
+ switch (action) {
+ case SPDK_MEM_MAP_NOTIFY_REGISTER:
+ mr = ibv_reg_mr(pd, vaddr, size,
+ IBV_ACCESS_LOCAL_WRITE |
+ IBV_ACCESS_REMOTE_READ |
+ IBV_ACCESS_REMOTE_WRITE);
+ if (mr == NULL) {
+ SPDK_ERRLOG("ibv_reg_mr() failed\n");
+ return -EFAULT;
+ } else {
+ rc = spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr);
+ }
+ break;
+ case SPDK_MEM_MAP_NOTIFY_UNREGISTER:
+ mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL);
+ rc = spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size);
+ if (mr) {
+ ibv_dereg_mr(mr);
+ }
+ break;
+ default:
+ SPDK_UNREACHABLE();
+ }
+
+ return rc;
+}
+
+static int
+nvme_rdma_register_mem(struct nvme_rdma_qpair *rqpair)
+{
+ struct ibv_pd *pd = rqpair->cm_id->qp->pd;
+ struct spdk_nvme_rdma_mr_map *mr_map;
+ const struct spdk_mem_map_ops nvme_rdma_map_ops = {
+ .notify_cb = nvme_rdma_mr_map_notify,
+ .are_contiguous = NULL
+ };
+
+ pthread_mutex_lock(&g_rdma_mr_maps_mutex);
+
+ /* Look up existing mem map registration for this pd */
+ LIST_FOREACH(mr_map, &g_rdma_mr_maps, link) {
+ if (mr_map->pd == pd) {
+ mr_map->ref++;
+ rqpair->mr_map = mr_map;
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return 0;
+ }
+ }
+
+ mr_map = calloc(1, sizeof(*mr_map));
+ if (mr_map == NULL) {
+ SPDK_ERRLOG("calloc() failed\n");
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return -1;
+ }
+
+ mr_map->ref = 1;
+ mr_map->pd = pd;
+ mr_map->map = spdk_mem_map_alloc((uint64_t)NULL, &nvme_rdma_map_ops, pd);
+ if (mr_map->map == NULL) {
+ SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
+ free(mr_map);
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+ return -1;
+ }
+
+ rqpair->mr_map = mr_map;
+ LIST_INSERT_HEAD(&g_rdma_mr_maps, mr_map, link);
+
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+
+ return 0;
+}
+
+static void
+nvme_rdma_unregister_mem(struct nvme_rdma_qpair *rqpair)
+{
+ struct spdk_nvme_rdma_mr_map *mr_map;
+
+ mr_map = rqpair->mr_map;
+ rqpair->mr_map = NULL;
+
+ if (mr_map == NULL) {
+ return;
+ }
+
+ pthread_mutex_lock(&g_rdma_mr_maps_mutex);
+
+ assert(mr_map->ref > 0);
+ mr_map->ref--;
+ if (mr_map->ref == 0) {
+ LIST_REMOVE(mr_map, link);
+ spdk_mem_map_free(&mr_map->map);
+ free(mr_map);
+ }
+
+ pthread_mutex_unlock(&g_rdma_mr_maps_mutex);
+}
+
+static int
+nvme_rdma_qpair_connect(struct nvme_rdma_qpair *rqpair)
+{
+ struct sockaddr_storage dst_addr;
+ struct sockaddr_storage src_addr;
+ bool src_addr_specified;
+ int rc;
+ struct spdk_nvme_ctrlr *ctrlr;
+ int family;
+
+ rqpair->cm_channel = rdma_create_event_channel();
+ if (rqpair->cm_channel == NULL) {
+ SPDK_ERRLOG("rdma_create_event_channel() failed\n");
+ return -1;
+ }
+
+ ctrlr = rqpair->qpair.ctrlr;
+
+ switch (ctrlr->trid.adrfam) {
+ case SPDK_NVMF_ADRFAM_IPV4:
+ family = AF_INET;
+ break;
+ case SPDK_NVMF_ADRFAM_IPV6:
+ family = AF_INET6;
+ break;
+ default:
+ SPDK_ERRLOG("Unhandled ADRFAM %d\n", ctrlr->trid.adrfam);
+ return -1;
+ }
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "adrfam %d ai_family %d\n", ctrlr->trid.adrfam, family);
+
+ memset(&dst_addr, 0, sizeof(dst_addr));
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "trsvcid is %s\n", ctrlr->trid.trsvcid);
+ rc = nvme_rdma_parse_addr(&dst_addr, family, ctrlr->trid.traddr, ctrlr->trid.trsvcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("dst_addr nvme_rdma_parse_addr() failed\n");
+ return -1;
+ }
+
+ if (ctrlr->opts.src_addr[0] || ctrlr->opts.src_svcid[0]) {
+ memset(&src_addr, 0, sizeof(src_addr));
+ rc = nvme_rdma_parse_addr(&src_addr, family, ctrlr->opts.src_addr, ctrlr->opts.src_svcid);
+ if (rc != 0) {
+ SPDK_ERRLOG("src_addr nvme_rdma_parse_addr() failed\n");
+ return -1;
+ }
+ src_addr_specified = true;
+ } else {
+ src_addr_specified = false;
+ }
+
+ rc = rdma_create_id(rqpair->cm_channel, &rqpair->cm_id, rqpair, RDMA_PS_TCP);
+ if (rc < 0) {
+ SPDK_ERRLOG("rdma_create_id() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_resolve_addr(rqpair,
+ src_addr_specified ? (struct sockaddr *)&src_addr : NULL,
+ (struct sockaddr *)&dst_addr, rqpair->cm_channel);
+ if (rc < 0) {
+ SPDK_ERRLOG("nvme_rdma_resolve_addr() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_qpair_init(rqpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("nvme_rdma_qpair_init() failed\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_connect(rqpair);
+ if (rc != 0) {
+ SPDK_ERRLOG("Unable to connect the rqpair\n");
+ return -1;
+ }
+
+ rc = nvme_rdma_alloc_reqs(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc) {
+ SPDK_ERRLOG("Unable to allocate rqpair RDMA requests\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA requests allocated\n");
+
+ rc = nvme_rdma_alloc_rsps(rqpair);
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "rc =%d\n", rc);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to allocate rqpair RDMA responses\n");
+ return -1;
+ }
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "RDMA responses allocated\n");
+
+ rc = nvme_rdma_register_mem(rqpair);
+ if (rc < 0) {
+ SPDK_ERRLOG("Unable to register memory for RDMA\n");
+ return -1;
+ }
+
+ rc = nvme_fabric_qpair_connect(&rqpair->qpair, rqpair->num_entries);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to send an NVMe-oF Fabric CONNECT command\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Build SGL describing empty payload.
+ */
+static int
+nvme_rdma_build_null_request(struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ /* The RDMA SGL needs one element describing the NVMe command. */
+ rdma_req->send_wr.num_sge = 1;
+
+ req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ req->cmd.dptr.sgl1.keyed.length = 0;
+ req->cmd.dptr.sgl1.keyed.key = 0;
+ req->cmd.dptr.sgl1.address = 0;
+
+ return 0;
+}
+
+/*
+ * Build inline SGL describing contiguous payload buffer.
+ */
+static int
+nvme_rdma_build_contig_inline_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ struct ibv_mr *mr;
+ void *payload;
+ uint64_t requested_size;
+
+ payload = req->payload.contig_or_cb_arg + req->payload_offset;
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ requested_size = req->payload_size;
+ mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map,
+ (uint64_t)payload, &requested_size);
+
+ if (mr == NULL || requested_size < req->payload_size) {
+ return -EINVAL;
+ }
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ rdma_req->send_sgl[1].addr = (uint64_t)payload;
+ rdma_req->send_sgl[1].length = (uint32_t)req->payload_size;
+ rdma_req->send_sgl[1].lkey = mr->lkey;
+
+ /* The RDMA SGL contains two elements. The first describes
+ * the NVMe command and the second describes the data
+ * payload. */
+ rdma_req->send_wr.num_sge = 2;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
+ /* Inline only supported for icdoff == 0 currently. This function will
+ * not get called for controllers with other values. */
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+
+ return 0;
+}
+
+/*
+ * Build SGL describing contiguous payload buffer.
+ */
+static int
+nvme_rdma_build_contig_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ void *payload = req->payload.contig_or_cb_arg + req->payload_offset;
+ struct ibv_mr *mr;
+ uint64_t requested_size;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG);
+
+ requested_size = req->payload_size;
+ mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)payload,
+ &requested_size);
+ if (mr == NULL || requested_size < req->payload_size) {
+ return -1;
+ }
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ /* The RDMA SGL needs one element describing the NVMe command. */
+ rdma_req->send_wr.num_sge = 1;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ req->cmd.dptr.sgl1.keyed.length = req->payload_size;
+ req->cmd.dptr.sgl1.keyed.key = mr->rkey;
+ req->cmd.dptr.sgl1.address = (uint64_t)payload;
+
+ return 0;
+}
+
+/*
+ * Build SGL describing scattered payload buffer.
+ */
+static int
+nvme_rdma_build_sgl_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ struct spdk_nvmf_cmd *cmd = &rqpair->cmds[rdma_req->id];
+ struct ibv_mr *mr = NULL;
+ void *virt_addr;
+ uint64_t remaining_size, mr_length;
+ uint32_t sge_length;
+ int rc, max_num_sgl, num_sgl_desc;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ max_num_sgl = req->qpair->ctrlr->max_sges;
+
+ remaining_size = req->payload_size;
+ num_sgl_desc = 0;
+ do {
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &sge_length);
+ if (rc) {
+ return -1;
+ }
+
+ sge_length = spdk_min(remaining_size, sge_length);
+ mr_length = sge_length;
+
+ mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr,
+ &mr_length);
+
+ if (mr == NULL || mr_length < sge_length) {
+ return -1;
+ }
+
+ cmd->sgl[num_sgl_desc].keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ cmd->sgl[num_sgl_desc].keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ cmd->sgl[num_sgl_desc].keyed.length = sge_length;
+ cmd->sgl[num_sgl_desc].keyed.key = mr->rkey;
+ cmd->sgl[num_sgl_desc].address = (uint64_t)virt_addr;
+
+ remaining_size -= sge_length;
+ num_sgl_desc++;
+ } while (remaining_size > 0 && num_sgl_desc < max_num_sgl);
+
+
+ /* Should be impossible if we did our sgl checks properly up the stack, but do a sanity check here. */
+ if (remaining_size > 0) {
+ return -1;
+ }
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+
+ /* The RDMA SGL needs one element describing some portion
+ * of the spdk_nvmf_cmd structure. */
+ rdma_req->send_wr.num_sge = 1;
+
+ /*
+ * If only one SGL descriptor is required, it can be embedded directly in the command
+ * as a data block descriptor.
+ */
+ if (num_sgl_desc == 1) {
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ req->cmd.dptr.sgl1.keyed.type = SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK;
+ req->cmd.dptr.sgl1.keyed.subtype = SPDK_NVME_SGL_SUBTYPE_ADDRESS;
+ req->cmd.dptr.sgl1.keyed.length = req->payload_size;
+ req->cmd.dptr.sgl1.keyed.key = mr->rkey;
+ req->cmd.dptr.sgl1.address = rqpair->cmds[rdma_req->id].sgl[0].address;
+ } else {
+ /*
+		 * Otherwise, the SGL descriptor embedded in the command must point to the list of
+		 * SGL descriptors used to describe the operation; in that case it is a last-segment descriptor.
+ */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd) + sizeof(struct
+ spdk_nvme_sgl_descriptor) * num_sgl_desc;
+
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_LAST_SEGMENT;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = num_sgl_desc * sizeof(struct spdk_nvme_sgl_descriptor);
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+ }
+
+ return 0;
+}
+
+/*
+ * Build inline SGL describing sgl payload buffer.
+ */
+static int
+nvme_rdma_build_sgl_inline_request(struct nvme_rdma_qpair *rqpair,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct nvme_request *req = rdma_req->req;
+ struct ibv_mr *mr;
+ uint32_t length;
+ uint64_t requested_size;
+ void *virt_addr;
+ int rc;
+
+ assert(req->payload_size != 0);
+ assert(nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL);
+ assert(req->payload.reset_sgl_fn != NULL);
+ assert(req->payload.next_sge_fn != NULL);
+ req->payload.reset_sgl_fn(req->payload.contig_or_cb_arg, req->payload_offset);
+
+ /* TODO: for now, we only support a single SGL entry */
+ rc = req->payload.next_sge_fn(req->payload.contig_or_cb_arg, &virt_addr, &length);
+ if (rc) {
+ return -1;
+ }
+
+ if (length < req->payload_size) {
+ SPDK_ERRLOG("multi-element SGL currently not supported for RDMA\n");
+ return -1;
+ }
+
+ requested_size = req->payload_size;
+ mr = (struct ibv_mr *)spdk_mem_map_translate(rqpair->mr_map->map, (uint64_t)virt_addr,
+ &requested_size);
+ if (mr == NULL || requested_size < req->payload_size) {
+ return -1;
+ }
+
+ /* The first element of this SGL is pointing at an
+ * spdk_nvmf_cmd object. For this particular command,
+ * we only need the first 64 bytes corresponding to
+ * the NVMe command. */
+ rdma_req->send_sgl[0].length = sizeof(struct spdk_nvme_cmd);
+
+ rdma_req->send_sgl[1].addr = (uint64_t)virt_addr;
+ rdma_req->send_sgl[1].length = (uint32_t)req->payload_size;
+ rdma_req->send_sgl[1].lkey = mr->lkey;
+
+ /* The RDMA SGL contains two elements. The first describes
+ * the NVMe command and the second describes the data
+ * payload. */
+ rdma_req->send_wr.num_sge = 2;
+
+ req->cmd.psdt = SPDK_NVME_PSDT_SGL_MPTR_CONTIG;
+ req->cmd.dptr.sgl1.unkeyed.type = SPDK_NVME_SGL_TYPE_DATA_BLOCK;
+ req->cmd.dptr.sgl1.unkeyed.subtype = SPDK_NVME_SGL_SUBTYPE_OFFSET;
+ req->cmd.dptr.sgl1.unkeyed.length = (uint32_t)req->payload_size;
+ /* Inline only supported for icdoff == 0 currently. This function will
+ * not get called for controllers with other values. */
+ req->cmd.dptr.sgl1.address = (uint64_t)0;
+
+ return 0;
+}
+
+static inline unsigned int
+nvme_rdma_icdsz_bytes(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return (ctrlr->cdata.nvmf_specific.ioccsz * 16 - sizeof(struct spdk_nvme_cmd));
+}
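+
+/*
+ * Editor's note (illustrative, not part of the upstream patch): IOCCSZ is
+ * reported by the controller in 16-byte units and covers the 64-byte command
+ * itself, so the helper above yields the number of bytes available for
+ * in-capsule data. For example, a controller reporting ioccsz = 260 allows
+ *
+ *   260 * 16 - sizeof(struct spdk_nvme_cmd) = 4160 - 64 = 4096
+ *
+ * bytes of inline write data, while the spec minimum ioccsz = 4 (64 bytes,
+ * command only) leaves 0 bytes, so the inline request builders above are
+ * never used for such a controller.
+ */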
+
+static int
+nvme_rdma_req_init(struct nvme_rdma_qpair *rqpair, struct nvme_request *req,
+ struct spdk_nvme_rdma_req *rdma_req)
+{
+ struct spdk_nvme_ctrlr *ctrlr = rqpair->qpair.ctrlr;
+ int rc;
+
+ rdma_req->req = req;
+ req->cmd.cid = rdma_req->id;
+
+ if (req->payload_size == 0) {
+ rc = nvme_rdma_build_null_request(rdma_req);
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_CONTIG) {
+ /*
+ * Check if icdoff is non zero, to avoid interop conflicts with
+ * targets with non-zero icdoff. Both SPDK and the Linux kernel
+ * targets use icdoff = 0. For targets with non-zero icdoff, we
+ * will currently just not use inline data for now.
+ */
+ if (req->cmd.opc == SPDK_NVME_OPC_WRITE &&
+ req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) &&
+ (ctrlr->cdata.nvmf_specific.icdoff == 0)) {
+ rc = nvme_rdma_build_contig_inline_request(rqpair, rdma_req);
+ } else {
+ rc = nvme_rdma_build_contig_request(rqpair, rdma_req);
+ }
+ } else if (nvme_payload_type(&req->payload) == NVME_PAYLOAD_TYPE_SGL) {
+ if (req->cmd.opc == SPDK_NVME_OPC_WRITE &&
+ req->payload_size <= nvme_rdma_icdsz_bytes(ctrlr) &&
+ ctrlr->cdata.nvmf_specific.icdoff == 0) {
+ rc = nvme_rdma_build_sgl_inline_request(rqpair, rdma_req);
+ } else {
+ rc = nvme_rdma_build_sgl_request(rqpair, rdma_req);
+ }
+ } else {
+ rc = -1;
+ }
+
+ if (rc) {
+ return rc;
+ }
+
+ memcpy(&rqpair->cmds[rdma_req->id], &req->cmd, sizeof(req->cmd));
+ return 0;
+}
+
+static struct spdk_nvme_qpair *
+nvme_rdma_ctrlr_create_qpair(struct spdk_nvme_ctrlr *ctrlr,
+ uint16_t qid, uint32_t qsize,
+ enum spdk_nvme_qprio qprio,
+ uint32_t num_requests)
+{
+ struct nvme_rdma_qpair *rqpair;
+ struct spdk_nvme_qpair *qpair;
+ int rc;
+
+ rqpair = calloc(1, sizeof(struct nvme_rdma_qpair));
+ if (!rqpair) {
+		SPDK_ERRLOG("failed to allocate rqpair\n");
+ return NULL;
+ }
+
+ rqpair->num_entries = qsize;
+
+ qpair = &rqpair->qpair;
+
+ rc = nvme_qpair_init(qpair, qid, ctrlr, qprio, num_requests);
+ if (rc != 0) {
+ return NULL;
+ }
+
+ rc = nvme_rdma_qpair_connect(rqpair);
+ if (rc < 0) {
+ nvme_rdma_qpair_destroy(qpair);
+ return NULL;
+ }
+
+ return qpair;
+}
+
+static int
+nvme_rdma_qpair_destroy(struct spdk_nvme_qpair *qpair)
+{
+ struct nvme_rdma_qpair *rqpair;
+
+ if (!qpair) {
+ return -1;
+ }
+ nvme_qpair_deinit(qpair);
+
+ rqpair = nvme_rdma_qpair(qpair);
+
+ nvme_rdma_unregister_mem(rqpair);
+ nvme_rdma_free_reqs(rqpair);
+ nvme_rdma_free_rsps(rqpair);
+
+ if (rqpair->cm_id) {
+ if (rqpair->cm_id->qp) {
+ rdma_destroy_qp(rqpair->cm_id);
+ }
+ rdma_destroy_id(rqpair->cm_id);
+ }
+
+ if (rqpair->cq) {
+ ibv_destroy_cq(rqpair->cq);
+ }
+
+ if (rqpair->cm_channel) {
+ rdma_destroy_event_channel(rqpair->cm_channel);
+ }
+
+ free(rqpair);
+
+ return 0;
+}
+
+struct spdk_nvme_qpair *
+nvme_rdma_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ return nvme_rdma_ctrlr_create_qpair(ctrlr, qid, opts->io_queue_size, opts->qprio,
+ opts->io_queue_requests);
+}
+
+int
+nvme_rdma_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ /* do nothing here */
+ return 0;
+}
+
+/* This function must only be called while holding g_spdk_nvme_driver->lock */
+int
+nvme_rdma_ctrlr_scan(const struct spdk_nvme_transport_id *discovery_trid,
+ void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_remove_cb remove_cb,
+ bool direct_connect)
+{
+ struct spdk_nvme_ctrlr_opts discovery_opts;
+ struct spdk_nvme_ctrlr *discovery_ctrlr;
+ union spdk_nvme_cc_register cc;
+ int rc;
+ struct nvme_completion_poll_status status;
+
+ if (strcmp(discovery_trid->subnqn, SPDK_NVMF_DISCOVERY_NQN) != 0) {
+		/* Not a discovery controller; try to connect to it directly. */
+ rc = nvme_ctrlr_probe(discovery_trid, NULL, probe_cb, cb_ctx);
+ return rc;
+ }
+
+ spdk_nvme_ctrlr_get_default_ctrlr_opts(&discovery_opts, sizeof(discovery_opts));
+	/* For the discovery controller, set the keep-alive timeout to 0 (disabled). */
+ discovery_opts.keep_alive_timeout_ms = 0;
+
+ discovery_ctrlr = nvme_rdma_ctrlr_construct(discovery_trid, &discovery_opts, NULL);
+ if (discovery_ctrlr == NULL) {
+ return -1;
+ }
+
+ /* TODO: this should be using the normal NVMe controller initialization process */
+ cc.raw = 0;
+ cc.bits.en = 1;
+ cc.bits.iosqes = 6; /* SQ entry size == 64 == 2^6 */
+ cc.bits.iocqes = 4; /* CQ entry size == 16 == 2^4 */
+ rc = nvme_transport_ctrlr_set_reg_4(discovery_ctrlr, offsetof(struct spdk_nvme_registers, cc.raw),
+ cc.raw);
+ if (rc < 0) {
+ SPDK_ERRLOG("Failed to set cc\n");
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ return -1;
+ }
+
+ /* get the cdata info */
+ rc = nvme_ctrlr_cmd_identify(discovery_ctrlr, SPDK_NVME_IDENTIFY_CTRLR, 0, 0,
+ &discovery_ctrlr->cdata, sizeof(discovery_ctrlr->cdata),
+ nvme_completion_poll_cb, &status);
+ if (rc != 0) {
+ SPDK_ERRLOG("Failed to identify cdata\n");
+ return rc;
+ }
+
+ if (spdk_nvme_wait_for_completion(discovery_ctrlr->adminq, &status)) {
+ SPDK_ERRLOG("nvme_identify_controller failed!\n");
+ return -ENXIO;
+ }
+
+ /* Direct attach through spdk_nvme_connect() API */
+ if (direct_connect == true) {
+ /* Set the ready state to skip the normal init process */
+ discovery_ctrlr->state = NVME_CTRLR_STATE_READY;
+ nvme_ctrlr_connected(discovery_ctrlr);
+ nvme_ctrlr_add_process(discovery_ctrlr, 0);
+ return 0;
+ }
+
+ rc = nvme_fabric_ctrlr_discover(discovery_ctrlr, cb_ctx, probe_cb);
+ nvme_ctrlr_destruct(discovery_ctrlr);
+ return rc;
+}
+
+struct spdk_nvme_ctrlr *nvme_rdma_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ struct nvme_rdma_ctrlr *rctrlr;
+ union spdk_nvme_cap_register cap;
+ union spdk_nvme_vs_register vs;
+ int rc;
+
+ rctrlr = calloc(1, sizeof(struct nvme_rdma_ctrlr));
+ if (rctrlr == NULL) {
+ SPDK_ERRLOG("could not allocate ctrlr\n");
+ return NULL;
+ }
+
+ rctrlr->ctrlr.trid.trtype = SPDK_NVME_TRANSPORT_RDMA;
+ rctrlr->ctrlr.opts = *opts;
+ memcpy(&rctrlr->ctrlr.trid, trid, sizeof(rctrlr->ctrlr.trid));
+
+ rc = nvme_ctrlr_construct(&rctrlr->ctrlr);
+ if (rc != 0) {
+ free(rctrlr);
+ return NULL;
+ }
+
+ rctrlr->ctrlr.adminq = nvme_rdma_ctrlr_create_qpair(&rctrlr->ctrlr, 0,
+ SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES, 0, SPDK_NVMF_MIN_ADMIN_QUEUE_ENTRIES);
+ if (!rctrlr->ctrlr.adminq) {
+ SPDK_ERRLOG("failed to create admin qpair\n");
+ nvme_rdma_ctrlr_destruct(&rctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_cap(&rctrlr->ctrlr, &cap)) {
+ SPDK_ERRLOG("get_cap() failed\n");
+ nvme_ctrlr_destruct(&rctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_get_vs(&rctrlr->ctrlr, &vs)) {
+ SPDK_ERRLOG("get_vs() failed\n");
+ nvme_ctrlr_destruct(&rctrlr->ctrlr);
+ return NULL;
+ }
+
+ if (nvme_ctrlr_add_process(&rctrlr->ctrlr, 0) != 0) {
+ SPDK_ERRLOG("nvme_ctrlr_add_process() failed\n");
+ nvme_ctrlr_destruct(&rctrlr->ctrlr);
+ return NULL;
+ }
+
+ nvme_ctrlr_init_cap(&rctrlr->ctrlr, &cap, &vs);
+
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "successfully initialized the nvmf ctrlr\n");
+ return &rctrlr->ctrlr;
+}
+
+int
+nvme_rdma_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ struct nvme_rdma_ctrlr *rctrlr = nvme_rdma_ctrlr(ctrlr);
+
+ if (ctrlr->adminq) {
+ nvme_rdma_qpair_destroy(ctrlr->adminq);
+ }
+
+ nvme_ctrlr_destruct_finish(ctrlr);
+
+ free(rctrlr);
+
+ return 0;
+}
+
+int
+nvme_rdma_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ return nvme_fabric_ctrlr_set_reg_4(ctrlr, offset, value);
+}
+
+int
+nvme_rdma_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ return nvme_fabric_ctrlr_set_reg_8(ctrlr, offset, value);
+}
+
+int
+nvme_rdma_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ return nvme_fabric_ctrlr_get_reg_4(ctrlr, offset, value);
+}
+
+int
+nvme_rdma_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ return nvme_fabric_ctrlr_get_reg_8(ctrlr, offset, value);
+}
+
+int
+nvme_rdma_qpair_submit_request(struct spdk_nvme_qpair *qpair,
+ struct nvme_request *req)
+{
+ struct nvme_rdma_qpair *rqpair;
+ struct spdk_nvme_rdma_req *rdma_req;
+ struct ibv_send_wr *wr, *bad_wr = NULL;
+ int rc;
+
+ rqpair = nvme_rdma_qpair(qpair);
+ assert(rqpair != NULL);
+ assert(req != NULL);
+
+ rdma_req = nvme_rdma_req_get(rqpair);
+ if (!rdma_req) {
+ /*
+ * No rdma_req is available. Queue the request to be processed later.
+ */
+ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq);
+ return 0;
+ }
+
+ if (nvme_rdma_req_init(rqpair, req, rdma_req)) {
+ SPDK_ERRLOG("nvme_rdma_req_init() failed\n");
+ nvme_rdma_req_put(rqpair, rdma_req);
+ return -1;
+ }
+
+ req->timed_out = false;
+ if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
+ req->submit_tick = spdk_get_ticks();
+ } else {
+ req->submit_tick = 0;
+ }
+
+ wr = &rdma_req->send_wr;
+
+ nvme_rdma_trace_ibv_sge(wr->sg_list);
+
+ rc = ibv_post_send(rqpair->cm_id->qp, wr, &bad_wr);
+ if (rc) {
+ SPDK_ERRLOG("Failure posting rdma send for NVMf completion: %d (%s)\n", rc, spdk_strerror(rc));
+ }
+
+ return rc;
+}
+
+int
+nvme_rdma_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ return nvme_rdma_qpair_destroy(qpair);
+}
+
+int
+nvme_rdma_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ return nvme_rdma_qpair_connect(nvme_rdma_qpair(qpair));
+}
+
+int
+nvme_rdma_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+	/* Currently a no-op. */
+ return 0;
+}
+
+int
+nvme_rdma_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+	/* Currently a no-op. */
+ return 0;
+}
+
+int
+nvme_rdma_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+	/* Currently a no-op. */
+ return 0;
+}
+
+int
+nvme_rdma_qpair_fail(struct spdk_nvme_qpair *qpair)
+{
+	/* Currently a no-op. */
+ return 0;
+}
+
+static void
+nvme_rdma_qpair_check_timeout(struct spdk_nvme_qpair *qpair)
+{
+ uint64_t t02;
+ struct spdk_nvme_rdma_req *rdma_req, *tmp;
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct spdk_nvme_ctrlr *ctrlr = qpair->ctrlr;
+ struct spdk_nvme_ctrlr_process *active_proc;
+
+ /* Don't check timeouts during controller initialization. */
+ if (ctrlr->state != NVME_CTRLR_STATE_READY) {
+ return;
+ }
+
+ if (nvme_qpair_is_admin_queue(qpair)) {
+ active_proc = spdk_nvme_ctrlr_get_current_process(ctrlr);
+ } else {
+ active_proc = qpair->active_proc;
+ }
+
+ /* Only check timeouts if the current process has a timeout callback. */
+ if (active_proc == NULL || active_proc->timeout_cb_fn == NULL) {
+ return;
+ }
+
+ t02 = spdk_get_ticks();
+ TAILQ_FOREACH_SAFE(rdma_req, &rqpair->outstanding_reqs, link, tmp) {
+ assert(rdma_req->req != NULL);
+
+ if (nvme_request_check_timeout(rdma_req->req, rdma_req->id, active_proc, t02)) {
+ /*
+ * The requests are in order, so as soon as one has not timed out,
+ * stop iterating.
+ */
+ break;
+ }
+ }
+}
+
+#define MAX_COMPLETIONS_PER_POLL 128
+
+int
+nvme_rdma_qpair_process_completions(struct spdk_nvme_qpair *qpair,
+ uint32_t max_completions)
+{
+ struct nvme_rdma_qpair *rqpair = nvme_rdma_qpair(qpair);
+ struct ibv_wc wc[MAX_COMPLETIONS_PER_POLL];
+ int i, rc, batch_size;
+ uint32_t reaped;
+ struct ibv_cq *cq;
+
+ if (max_completions == 0) {
+ max_completions = rqpair->num_entries;
+ } else {
+ max_completions = spdk_min(max_completions, rqpair->num_entries);
+ }
+
+ cq = rqpair->cq;
+
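+	/*
+	 * Poll the CQ in batches of up to MAX_COMPLETIONS_PER_POLL entries.
+	 * Only IBV_WC_RECV completions (which carry NVMe completions) count
+	 * toward max_completions; the loop stops once the CQ is drained or
+	 * the cap is reached.
+	 */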
+ reaped = 0;
+ do {
+ batch_size = spdk_min((max_completions - reaped),
+ MAX_COMPLETIONS_PER_POLL);
+ rc = ibv_poll_cq(cq, batch_size, wc);
+ if (rc < 0) {
+ SPDK_ERRLOG("Error polling CQ! (%d): %s\n",
+ errno, spdk_strerror(errno));
+ return -1;
+ } else if (rc == 0) {
+ /* Ran out of completions */
+ break;
+ }
+
+ for (i = 0; i < rc; i++) {
+ if (wc[i].status) {
+ SPDK_ERRLOG("CQ error on Queue Pair %p, Response Index %lu (%d): %s\n",
+ qpair, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status));
+ return -1;
+ }
+
+ switch (wc[i].opcode) {
+ case IBV_WC_RECV:
+ SPDK_DEBUGLOG(SPDK_LOG_NVME, "CQ recv completion\n");
+
+ reaped++;
+
+ if (wc[i].byte_len < sizeof(struct spdk_nvme_cpl)) {
+ SPDK_ERRLOG("recv length %u less than expected response size\n", wc[i].byte_len);
+ return -1;
+ }
+
+ if (nvme_rdma_recv(rqpair, wc[i].wr_id)) {
+ SPDK_ERRLOG("nvme_rdma_recv processing failure\n");
+ return -1;
+ }
+ break;
+
+ case IBV_WC_SEND:
+ break;
+
+ default:
+ SPDK_ERRLOG("Received an unexpected opcode on the CQ: %d\n", wc[i].opcode);
+ return -1;
+ }
+ }
+ } while (reaped < max_completions);
+
+ if (spdk_unlikely(rqpair->qpair.ctrlr->timeout_enabled)) {
+ nvme_rdma_qpair_check_timeout(qpair);
+ }
+
+ return reaped;
+}
+
+uint32_t
+nvme_rdma_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+	/* TODO: this value should be obtained from the NVMe-oF target */
+ return NVME_RDMA_RW_BUFFER_SIZE;
+}
+
+uint16_t
+nvme_rdma_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ return spdk_min(ctrlr->cdata.nvmf_specific.msdbd, NVME_RDMA_MAX_SGL_DESCRIPTORS);
+}
+
+void *
+nvme_rdma_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
+{
+ return NULL;
+}
+
+int
+nvme_rdma_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
+{
+ return 0;
+}
diff --git a/src/spdk/lib/nvme/nvme_transport.c b/src/spdk/lib/nvme/nvme_transport.c
new file mode 100644
index 00000000..56052a0f
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_transport.c
@@ -0,0 +1,219 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * NVMe transport abstraction
+ */
+
+#include "nvme_internal.h"
+
+#ifdef DEBUG
+static __attribute__((noreturn)) void
+nvme_transport_unknown(enum spdk_nvme_transport_type trtype)
+{
+ SPDK_ERRLOG("Unknown transport %d\n", (int)trtype);
+ abort();
+}
+#define TRANSPORT_DEFAULT(trtype) default: nvme_transport_unknown(trtype);
+#else
+#define TRANSPORT_DEFAULT(trtype)
+#endif
+
+#define TRANSPORT_PCIE(func_name, args) case SPDK_NVME_TRANSPORT_PCIE: return nvme_pcie_ ## func_name args;
+#ifdef SPDK_CONFIG_RDMA
+#define TRANSPORT_FABRICS_RDMA(func_name, args) case SPDK_NVME_TRANSPORT_RDMA: return nvme_rdma_ ## func_name args;
+#define TRANSPORT_RDMA_AVAILABLE true
+#else
+#define TRANSPORT_FABRICS_RDMA(func_name, args) case SPDK_NVME_TRANSPORT_RDMA: SPDK_UNREACHABLE();
+#define TRANSPORT_RDMA_AVAILABLE false
+#endif
+#define TRANSPORT_FABRICS_FC(func_name, args) case SPDK_NVME_TRANSPORT_FC: SPDK_UNREACHABLE();
+#define NVME_TRANSPORT_CALL(trtype, func_name, args) \
+ do { \
+ switch (trtype) { \
+ TRANSPORT_PCIE(func_name, args) \
+ TRANSPORT_FABRICS_RDMA(func_name, args) \
+ TRANSPORT_FABRICS_FC(func_name, args) \
+ TRANSPORT_DEFAULT(trtype) \
+ } \
+ SPDK_UNREACHABLE(); \
+ } while (0)
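+
+/*
+ * For illustration only (a sketch, not part of the dispatch machinery): a call
+ * such as
+ *
+ *     NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_enable, (ctrlr));
+ *
+ * expands, with RDMA support compiled in, to roughly:
+ *
+ *     do {
+ *         switch (ctrlr->trid.trtype) {
+ *         case SPDK_NVME_TRANSPORT_PCIE: return nvme_pcie_ctrlr_enable(ctrlr);
+ *         case SPDK_NVME_TRANSPORT_RDMA: return nvme_rdma_ctrlr_enable(ctrlr);
+ *         case SPDK_NVME_TRANSPORT_FC: SPDK_UNREACHABLE();
+ *         default: nvme_transport_unknown(ctrlr->trid.trtype);  (DEBUG builds only)
+ *         }
+ *         SPDK_UNREACHABLE();
+ *     } while (0);
+ *
+ * Each nvme_transport_* wrapper below is therefore a thin per-transport
+ * dispatcher with no logic of its own.
+ */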
+
+bool
+spdk_nvme_transport_available(enum spdk_nvme_transport_type trtype)
+{
+ switch (trtype) {
+ case SPDK_NVME_TRANSPORT_PCIE:
+ return true;
+
+ case SPDK_NVME_TRANSPORT_RDMA:
+ return TRANSPORT_RDMA_AVAILABLE;
+
+ case SPDK_NVME_TRANSPORT_FC:
+ return false;
+ }
+
+ return false;
+}
+
+struct spdk_nvme_ctrlr *nvme_transport_ctrlr_construct(const struct spdk_nvme_transport_id *trid,
+ const struct spdk_nvme_ctrlr_opts *opts,
+ void *devhandle)
+{
+ NVME_TRANSPORT_CALL(trid->trtype, ctrlr_construct, (trid, opts, devhandle));
+}
+
+int
+nvme_transport_ctrlr_scan(const struct spdk_nvme_transport_id *trid,
+ void *cb_ctx,
+ spdk_nvme_probe_cb probe_cb,
+ spdk_nvme_remove_cb remove_cb,
+ bool direct_connect)
+{
+ NVME_TRANSPORT_CALL(trid->trtype, ctrlr_scan, (trid, cb_ctx, probe_cb, remove_cb, direct_connect));
+}
+
+int
+nvme_transport_ctrlr_destruct(struct spdk_nvme_ctrlr *ctrlr)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_destruct, (ctrlr));
+}
+
+int
+nvme_transport_ctrlr_enable(struct spdk_nvme_ctrlr *ctrlr)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_enable, (ctrlr));
+}
+
+int
+nvme_transport_ctrlr_set_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t value)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_set_reg_4, (ctrlr, offset, value));
+}
+
+int
+nvme_transport_ctrlr_set_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t value)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_set_reg_8, (ctrlr, offset, value));
+}
+
+int
+nvme_transport_ctrlr_get_reg_4(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint32_t *value)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_reg_4, (ctrlr, offset, value));
+}
+
+int
+nvme_transport_ctrlr_get_reg_8(struct spdk_nvme_ctrlr *ctrlr, uint32_t offset, uint64_t *value)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_reg_8, (ctrlr, offset, value));
+}
+
+uint32_t
+nvme_transport_ctrlr_get_max_xfer_size(struct spdk_nvme_ctrlr *ctrlr)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_max_xfer_size, (ctrlr));
+}
+
+uint16_t
+nvme_transport_ctrlr_get_max_sges(struct spdk_nvme_ctrlr *ctrlr)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_get_max_sges, (ctrlr));
+}
+
+void *
+nvme_transport_ctrlr_alloc_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, size_t size)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_alloc_cmb_io_buffer, (ctrlr, size));
+}
+
+int
+nvme_transport_ctrlr_free_cmb_io_buffer(struct spdk_nvme_ctrlr *ctrlr, void *buf, size_t size)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_free_cmb_io_buffer, (ctrlr, buf, size));
+}
+
+struct spdk_nvme_qpair *
+nvme_transport_ctrlr_create_io_qpair(struct spdk_nvme_ctrlr *ctrlr, uint16_t qid,
+ const struct spdk_nvme_io_qpair_opts *opts)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_create_io_qpair, (ctrlr, qid, opts));
+}
+
+int
+nvme_transport_ctrlr_delete_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_delete_io_qpair, (ctrlr, qpair));
+}
+
+int
+nvme_transport_ctrlr_reinit_io_qpair(struct spdk_nvme_ctrlr *ctrlr, struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(ctrlr->trid.trtype, ctrlr_reinit_io_qpair, (ctrlr, qpair));
+}
+
+int
+nvme_transport_qpair_enable(struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_enable, (qpair));
+}
+
+int
+nvme_transport_qpair_disable(struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_disable, (qpair));
+}
+
+int
+nvme_transport_qpair_reset(struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_reset, (qpair));
+}
+
+int
+nvme_transport_qpair_fail(struct spdk_nvme_qpair *qpair)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_fail, (qpair));
+}
+
+int
+nvme_transport_qpair_submit_request(struct spdk_nvme_qpair *qpair, struct nvme_request *req)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_submit_request, (qpair, req));
+}
+
+int32_t
+nvme_transport_qpair_process_completions(struct spdk_nvme_qpair *qpair, uint32_t max_completions)
+{
+ NVME_TRANSPORT_CALL(qpair->trtype, qpair_process_completions, (qpair, max_completions));
+}
diff --git a/src/spdk/lib/nvme/nvme_uevent.c b/src/spdk/lib/nvme/nvme_uevent.c
new file mode 100644
index 00000000..724cbc5c
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_uevent.c
@@ -0,0 +1,214 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "spdk/stdinc.h"
+#include "spdk/string.h"
+
+#include "spdk/log.h"
+#include "spdk/event.h"
+
+#include "nvme_uevent.h"
+
+#ifdef __linux__
+
+#include <linux/netlink.h>
+
+#define SPDK_UEVENT_MSG_LEN 4096
+
+int
+spdk_uevent_connect(void)
+{
+ struct sockaddr_nl addr;
+ int netlink_fd;
+ int size = 64 * 1024;
+ int flag;
+
+ memset(&addr, 0, sizeof(addr));
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pid = getpid();
+ addr.nl_groups = 0xffffffff;
+
+ netlink_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
+ if (netlink_fd < 0) {
+ return -1;
+ }
+
+ setsockopt(netlink_fd, SOL_SOCKET, SO_RCVBUFFORCE, &size, sizeof(size));
+
+ flag = fcntl(netlink_fd, F_GETFL);
+ if (fcntl(netlink_fd, F_SETFL, flag | O_NONBLOCK) < 0) {
+ SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", netlink_fd,
+ spdk_strerror(errno));
+ close(netlink_fd);
+ return -1;
+ }
+
+ if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
+ close(netlink_fd);
+ return -1;
+ }
+ return netlink_fd;
+}
+
+/* Note: We only parse events from the uio subsystem (and from devices bound
+ * to the vfio-pci driver) and ignore events from all other subsystems.
+ * A uio event carries fields such as:
+ * action: "add" or "remove"
+ * subsystem: "uio"
+ * dev_path: "/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0"
+ */
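+
+/*
+ * A hypothetical example payload (fields are NUL-separated "KEY=VALUE" strings)
+ * that this parser accepts for a uio hot-add event:
+ *
+ *   ACTION=add
+ *   DEVPATH=/devices/pci0000:80/0000:80:01.0/0000:81:00.0/uio/uio0
+ *   SUBSYSTEM=uio
+ *
+ * parse_event() truncates dev_path at "/uio/", takes the last remaining path
+ * component ("0000:81:00.0") as the PCI BDF, and returns it via event->traddr
+ * with event->action set to SPDK_NVME_UEVENT_ADD.
+ */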
+static int
+parse_event(const char *buf, struct spdk_uevent *event)
+{
+ char action[SPDK_UEVENT_MSG_LEN];
+ char subsystem[SPDK_UEVENT_MSG_LEN];
+ char dev_path[SPDK_UEVENT_MSG_LEN];
+ char driver[SPDK_UEVENT_MSG_LEN];
+ char vfio_pci_addr[SPDK_UEVENT_MSG_LEN];
+
+ memset(action, 0, SPDK_UEVENT_MSG_LEN);
+ memset(subsystem, 0, SPDK_UEVENT_MSG_LEN);
+ memset(dev_path, 0, SPDK_UEVENT_MSG_LEN);
+ memset(driver, 0, SPDK_UEVENT_MSG_LEN);
+ memset(vfio_pci_addr, 0, SPDK_UEVENT_MSG_LEN);
+
+ while (*buf) {
+ if (!strncmp(buf, "ACTION=", 7)) {
+ buf += 7;
+ snprintf(action, sizeof(action), "%s", buf);
+ } else if (!strncmp(buf, "DEVPATH=", 8)) {
+ buf += 8;
+ snprintf(dev_path, sizeof(dev_path), "%s", buf);
+ } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+ buf += 10;
+ snprintf(subsystem, sizeof(subsystem), "%s", buf);
+ } else if (!strncmp(buf, "DRIVER=", 7)) {
+ buf += 7;
+ snprintf(driver, sizeof(driver), "%s", buf);
+ } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+ buf += 14;
+ snprintf(vfio_pci_addr, sizeof(vfio_pci_addr), "%s", buf);
+ }
+ while (*buf++)
+ ;
+ }
+
+ if (!strncmp(subsystem, "uio", 3)) {
+ char *pci_address, *tmp;
+ struct spdk_pci_addr pci_addr;
+
+ event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_UIO;
+ if (!strncmp(action, "add", 3)) {
+ event->action = SPDK_NVME_UEVENT_ADD;
+ }
+ if (!strncmp(action, "remove", 6)) {
+ event->action = SPDK_NVME_UEVENT_REMOVE;
+ }
+		tmp = strstr(dev_path, "/uio/");
+		if (tmp == NULL) {
+			/* No "/uio/" component in DEVPATH; avoid dereferencing NULL. */
+			return -1;
+		}
+
+		memset(tmp, 0, SPDK_UEVENT_MSG_LEN - (tmp - dev_path));
+
+ pci_address = strrchr(dev_path, '/');
+ pci_address++;
+ if (spdk_pci_addr_parse(&pci_addr, pci_address) != 0) {
+ SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", pci_address);
+ return -1;
+ }
+ spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr);
+ return 1;
+ }
+ if (!strncmp(driver, "vfio-pci", 8)) {
+ struct spdk_pci_addr pci_addr;
+
+ event->subsystem = SPDK_NVME_UEVENT_SUBSYSTEM_VFIO;
+ if (!strncmp(action, "add", 3)) {
+ event->action = SPDK_NVME_UEVENT_ADD;
+ }
+ if (!strncmp(action, "remove", 6)) {
+ event->action = SPDK_NVME_UEVENT_REMOVE;
+ }
+ if (spdk_pci_addr_parse(&pci_addr, vfio_pci_addr) != 0) {
+ SPDK_ERRLOG("Invalid format for NVMe BDF: %s\n", vfio_pci_addr);
+ return -1;
+ }
+ spdk_pci_addr_fmt(event->traddr, sizeof(event->traddr), &pci_addr);
+ return 1;
+
+ }
+ return -1;
+}
+
+int
+spdk_get_uevent(int fd, struct spdk_uevent *uevent)
+{
+ int ret;
+ char buf[SPDK_UEVENT_MSG_LEN];
+
+ memset(uevent, 0, sizeof(struct spdk_uevent));
+ memset(buf, 0, SPDK_UEVENT_MSG_LEN);
+
+ ret = recv(fd, buf, SPDK_UEVENT_MSG_LEN - 1, MSG_DONTWAIT);
+ if (ret > 0) {
+ return parse_event(buf, uevent);
+ }
+
+ if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ return 0;
+ } else {
+ SPDK_ERRLOG("Socket read error(%d): %s\n", errno, spdk_strerror(errno));
+ return -1;
+ }
+ }
+
+ /* connection closed */
+ if (ret == 0) {
+ return -1;
+ }
+ return 0;
+}
+
+#else /* Not Linux */
+
+int
+spdk_uevent_connect(void)
+{
+ return -1;
+}
+
+int
+spdk_get_uevent(int fd, struct spdk_uevent *uevent)
+{
+ return -1;
+}
+#endif
diff --git a/src/spdk/lib/nvme/nvme_uevent.h b/src/spdk/lib/nvme/nvme_uevent.h
new file mode 100644
index 00000000..7fe0ab7a
--- /dev/null
+++ b/src/spdk/lib/nvme/nvme_uevent.h
@@ -0,0 +1,61 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright (c) Intel Corporation.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * SPDK uevent
+ */
+
+#include "spdk/env.h"
+#include "spdk/nvmf_spec.h"
+
+#ifndef SPDK_UEVENT_H_
+#define SPDK_UEVENT_H_
+
+#define SPDK_NVME_UEVENT_SUBSYSTEM_UIO 1
+#define SPDK_NVME_UEVENT_SUBSYSTEM_VFIO 2
+
+enum spdk_nvme_uevent_action {
+ SPDK_NVME_UEVENT_ADD = 0,
+ SPDK_NVME_UEVENT_REMOVE = 1,
+};
+
+struct spdk_uevent {
+ enum spdk_nvme_uevent_action action;
+ int subsystem;
+ char traddr[SPDK_NVMF_TRADDR_MAX_LEN + 1];
+};
+
+int spdk_uevent_connect(void);
+int spdk_get_uevent(int fd, struct spdk_uevent *uevent);
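+
+/*
+ * Minimal usage sketch (hypothetical caller, not part of this header):
+ *
+ *     int fd = spdk_uevent_connect();        // non-blocking netlink socket, or -1
+ *     struct spdk_uevent ev;
+ *
+ *     if (fd >= 0 && spdk_get_uevent(fd, &ev) > 0) {
+ *         // 1 == a uio/vfio event was parsed; 0 == nothing pending;
+ *         // -1 == unrelated event, socket error, or connection closed.
+ *         // ev.traddr holds the PCI BDF of the added/removed device.
+ *     }
+ */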
+
+#endif /* SPDK_UEVENT_H_ */