diff options
Diffstat (limited to '')
-rw-r--r-- | src/spdk/lib/nvmf/Makefile | 63 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/ctrlr.c | 1773 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/ctrlr_bdev.c | 531 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/ctrlr_discovery.c | 144 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/nvmf.c | 1173 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/nvmf_fc.h | 871 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/nvmf_internal.h | 333 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/rdma.c | 2930 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/request.c | 190 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/subsystem.c | 1269 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/transport.c | 236 | ||||
-rw-r--r-- | src/spdk/lib/nvmf/transport.h | 200 |
12 files changed, 9713 insertions, 0 deletions
diff --git a/src/spdk/lib/nvmf/Makefile b/src/spdk/lib/nvmf/Makefile new file mode 100644 index 00000000..8f299a90 --- /dev/null +++ b/src/spdk/lib/nvmf/Makefile @@ -0,0 +1,63 @@ +# +# BSD LICENSE +# +# Copyright (c) Intel Corporation. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +SPDK_ROOT_DIR := $(abspath $(CURDIR)/../..) +include $(SPDK_ROOT_DIR)/mk/spdk.common.mk + +C_SRCS = ctrlr.c ctrlr_discovery.c ctrlr_bdev.c \ + subsystem.c nvmf.c \ + request.c transport.c + +C_SRCS-$(CONFIG_RDMA) += rdma.c +LIBNAME = nvmf +LOCAL_SYS_LIBS = -luuid +ifeq ($(CONFIG_RDMA),y) +LOCAL_SYS_LIBS += -libverbs -lrdmacm +#Attach only if FreeBSD and RDMA is specified with configure +ifeq ($(OS),FreeBSD) +# Mellanox - MLX4 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx4.*)","") +LOCAL_SYS_LIBS += -lmlx4 +endif +# Mellanox - MLX5 HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libmlx5.*)","") +LOCAL_SYS_LIBS += -lmlx5 +endif +# Chelsio HBA Userspace Library +ifneq ("$(wildcard /usr/lib/libcxgb4.*)","") +LOCAL_SYS_LIBS += -lcxgb4 +endif +endif +endif + +include $(SPDK_ROOT_DIR)/mk/spdk.lib.mk diff --git a/src/spdk/lib/nvmf/ctrlr.c b/src/spdk/lib/nvmf/ctrlr.c new file mode 100644 index 00000000..ed5e68f0 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr.c @@ -0,0 +1,1773 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/bit_array.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/trace.h" +#include "spdk/nvme_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/version.h" + +#include "spdk_internal/log.h" + +#define MIN_KEEP_ALIVE_TIMEOUT 10000 + +#define MODEL_NUMBER "SPDK bdev Controller" + +/* + * Report the SPDK version as the firmware revision. + * SPDK_VERSION_STRING won't fit into FR (only 8 bytes), so try to fit the most important parts. + */ +#define FW_VERSION SPDK_VERSION_MAJOR_STRING SPDK_VERSION_MINOR_STRING SPDK_VERSION_PATCH_STRING + +static inline void +spdk_nvmf_invalid_connect_response(struct spdk_nvmf_fabric_connect_rsp *rsp, + uint8_t iattr, uint16_t ipo) +{ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + rsp->status_code_specific.invalid.iattr = iattr; + rsp->status_code_specific.invalid.ipo = ipo; +} + +#define SPDK_NVMF_INVALID_CONNECT_CMD(rsp, field) \ + spdk_nvmf_invalid_connect_response(rsp, 0, offsetof(struct spdk_nvmf_fabric_connect_cmd, field)) +#define SPDK_NVMF_INVALID_CONNECT_DATA(rsp, field) \ + spdk_nvmf_invalid_connect_response(rsp, 1, offsetof(struct spdk_nvmf_fabric_connect_data, field)) + +static void +ctrlr_add_qpair_and_update_rsp(struct spdk_nvmf_qpair *qpair, + struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvmf_fabric_connect_rsp *rsp) +{ + assert(ctrlr->admin_qpair->group->thread == spdk_get_thread()); + + /* check if we would exceed ctrlr connection limit */ + if (qpair->qid >= spdk_bit_array_capacity(ctrlr->qpair_mask)) { + SPDK_ERRLOG("Requested QID %u but Max QID is %u\n", + qpair->qid, spdk_bit_array_capacity(ctrlr->qpair_mask) - 1); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + if (spdk_bit_array_get(ctrlr->qpair_mask, qpair->qid)) { + SPDK_ERRLOG("Got I/O connect with duplicate QID %u\n", qpair->qid); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + return; + } + + qpair->ctrlr = ctrlr; + spdk_bit_array_set(ctrlr->qpair_mask, qpair->qid); + + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + rsp->status_code_specific.success.cntlid = ctrlr->cntlid; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "connect capsule response: cntlid = 0x%04x\n", + rsp->status_code_specific.success.cntlid); +} + +static void +_spdk_nvmf_request_complete(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + + spdk_nvmf_request_complete(req); +} + +static void +_spdk_nvmf_ctrlr_add_admin_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + ctrlr->admin_qpair = qpair; + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + spdk_nvmf_request_complete(req); +} + +static void +_spdk_nvmf_subsystem_add_ctrlr(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (spdk_nvmf_subsystem_add_ctrlr(ctrlr->subsys, ctrlr)) { + SPDK_ERRLOG("Unable to add controller to subsystem\n"); + free(ctrlr); + qpair->ctrlr = NULL; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_thread_send_msg(qpair->group->thread, _spdk_nvmf_request_complete, req); + return; + } + + spdk_thread_send_msg(ctrlr->thread, _spdk_nvmf_ctrlr_add_admin_qpair, req); +} + +static struct spdk_nvmf_ctrlr * +spdk_nvmf_ctrlr_create(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_request *req, + struct spdk_nvmf_fabric_connect_cmd *connect_cmd, + struct spdk_nvmf_fabric_connect_data *connect_data) +{ + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_transport *transport; + + ctrlr = calloc(1, sizeof(*ctrlr)); + if (ctrlr == NULL) { + SPDK_ERRLOG("Memory allocation failed\n"); + return NULL; + } + + req->qpair->ctrlr = ctrlr; + ctrlr->subsys = subsystem; + ctrlr->thread = req->qpair->group->thread; + + transport = req->qpair->transport; + ctrlr->qpair_mask = spdk_bit_array_create(transport->opts.max_qpairs_per_ctrlr); + if (!ctrlr->qpair_mask) { + SPDK_ERRLOG("Failed to allocate controller qpair mask\n"); + free(ctrlr); + return NULL; + } + + ctrlr->feat.keep_alive_timer.bits.kato = connect_cmd->kato; + ctrlr->feat.async_event_configuration.bits.ns_attr_notice = 1; + ctrlr->feat.volatile_write_cache.bits.wce = 1; + + /* Subtract 1 for admin queue, 1 for 0's based */ + ctrlr->feat.number_of_queues.bits.ncqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + ctrlr->feat.number_of_queues.bits.nsqr = transport->opts.max_qpairs_per_ctrlr - 1 - + 1; + + memcpy(ctrlr->hostid, connect_data->hostid, sizeof(ctrlr->hostid)); + + ctrlr->vcprop.cap.raw = 0; + ctrlr->vcprop.cap.bits.cqr = 1; /* NVMe-oF specification required */ + ctrlr->vcprop.cap.bits.mqes = transport->opts.max_queue_depth - + 1; /* max queue depth */ + ctrlr->vcprop.cap.bits.ams = 0; /* optional arb mechanisms */ + ctrlr->vcprop.cap.bits.to = 1; /* ready timeout - 500 msec units */ + ctrlr->vcprop.cap.bits.dstrd = 0; /* fixed to 0 for NVMe-oF */ + ctrlr->vcprop.cap.bits.css = SPDK_NVME_CAP_CSS_NVM; /* NVM command set */ + ctrlr->vcprop.cap.bits.mpsmin = 0; /* 2 ^ (12 + mpsmin) == 4k */ + ctrlr->vcprop.cap.bits.mpsmax = 0; /* 2 ^ (12 + mpsmax) == 4k */ + + /* Version Supported: 1.3 */ + ctrlr->vcprop.vs.bits.mjr = 1; + ctrlr->vcprop.vs.bits.mnr = 3; + ctrlr->vcprop.vs.bits.ter = 0; + + ctrlr->vcprop.cc.raw = 0; + ctrlr->vcprop.cc.bits.en = 0; /* Init controller disabled */ + + ctrlr->vcprop.csts.raw = 0; + ctrlr->vcprop.csts.bits.rdy = 0; /* Init controller as not ready */ + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cap 0x%" PRIx64 "\n", ctrlr->vcprop.cap.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "vs 0x%x\n", ctrlr->vcprop.vs.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cc 0x%x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "csts 0x%x\n", ctrlr->vcprop.csts.raw); + + spdk_thread_send_msg(subsystem->thread, _spdk_nvmf_subsystem_add_ctrlr, req); + + return ctrlr; +} + +void +spdk_nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr) +{ + spdk_nvmf_subsystem_remove_ctrlr(ctrlr->subsys, ctrlr); + + free(ctrlr); +} + +static void +spdk_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + /* Unit test will check qpair->ctrlr after calling spdk_nvmf_ctrlr_connect. + * For error case, the value should be NULL. So set it to NULL at first. + */ + qpair->ctrlr = NULL; + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + SPDK_ERRLOG("I/O connect not allowed on discovery controller\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (!ctrlr->vcprop.cc.bits.en) { + SPDK_ERRLOG("Got I/O connect before ctrlr was enabled\n"); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iosqes != sizeof(struct spdk_nvme_cmd)) { + SPDK_ERRLOG("Got I/O connect with invalid IOSQES %u\n", + ctrlr->vcprop.cc.bits.iosqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + if (1u << ctrlr->vcprop.cc.bits.iocqes != sizeof(struct spdk_nvme_cpl)) { + SPDK_ERRLOG("Got I/O connect with invalid IOCQES %u\n", + ctrlr->vcprop.cc.bits.iocqes); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, qid); + goto end; + } + + ctrlr_add_qpair_and_update_rsp(qpair, ctrlr, rsp); + +end: + spdk_thread_send_msg(qpair->group->thread, _spdk_nvmf_request_complete, req); +} + +static void +_spdk_nvmf_ctrlr_add_io_qpair(void *ctx) +{ + struct spdk_nvmf_request *req = ctx; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_subsystem *subsystem; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect I/O Queue for controller id 0x%x\n", data->cntlid); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, data->subnqn); + /* We already checked this in spdk_nvmf_ctrlr_connect */ + assert(subsystem != NULL); + + ctrlr = spdk_nvmf_subsystem_get_ctrlr(subsystem, data->cntlid); + if (ctrlr == NULL) { + SPDK_ERRLOG("Unknown controller ID 0x%x\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + spdk_thread_send_msg(qpair->group->thread, _spdk_nvmf_request_complete, req); + return; + } + + admin_qpair = ctrlr->admin_qpair; + qpair->ctrlr = ctrlr; + spdk_thread_send_msg(admin_qpair->group->thread, spdk_nvmf_ctrlr_add_io_qpair, req); +} + +static int +spdk_nvmf_ctrlr_connect(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_fabric_connect_data *data = req->data; + struct spdk_nvmf_fabric_connect_cmd *cmd = &req->cmd->connect_cmd; + struct spdk_nvmf_fabric_connect_rsp *rsp = &req->rsp->connect_rsp; + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_tgt *tgt = qpair->transport->tgt; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_subsystem *subsystem; + const char *subnqn, *hostnqn; + struct spdk_nvme_transport_id listen_trid = {}; + void *end; + + if (req->length < sizeof(struct spdk_nvmf_fabric_connect_data)) { + SPDK_ERRLOG("Connect command data length 0x%x too small\n", req->length); + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "recfmt 0x%x qid %u sqsize %u\n", + cmd->recfmt, cmd->qid, cmd->sqsize); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect data:\n"); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " cntlid: 0x%04x\n", data->cntlid); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostid: %08x-%04x-%04x-%02x%02x-%04x%08x ***\n", + ntohl(*(uint32_t *)&data->hostid[0]), + ntohs(*(uint16_t *)&data->hostid[4]), + ntohs(*(uint16_t *)&data->hostid[6]), + data->hostid[8], + data->hostid[9], + ntohs(*(uint16_t *)&data->hostid[10]), + ntohl(*(uint32_t *)&data->hostid[12])); + + if (cmd->recfmt != 0) { + SPDK_ERRLOG("Connect command unsupported RECFMT %u\n", cmd->recfmt); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INCOMPATIBLE_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that subnqn is null terminated */ + end = memchr(data->subnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Connect SUBNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + subnqn = data->subnqn; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " subnqn: \"%s\"\n", subnqn); + + subsystem = spdk_nvmf_tgt_find_subsystem(tgt, subnqn); + if (subsystem == NULL) { + SPDK_ERRLOG("Could not find subsystem '%s'\n", subnqn); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, subnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Ensure that hostnqn is null terminated */ + end = memchr(data->hostnqn, '\0', SPDK_NVMF_NQN_MAX_LEN + 1); + if (!end) { + SPDK_ERRLOG("Connect HOSTNQN is not null terminated\n"); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, hostnqn); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + hostnqn = data->hostnqn; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, " hostnqn: \"%s\"\n", hostnqn); + + if (!spdk_nvmf_subsystem_host_allowed(subsystem, hostnqn)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s'\n", subnqn, hostnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_nvmf_qpair_get_listen_trid(qpair, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' is unable to enforce access control due to an internal error.\n", + subnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (!spdk_nvmf_subsystem_listener_allowed(subsystem, &listen_trid)) { + SPDK_ERRLOG("Subsystem '%s' does not allow host '%s' to connect at this address.\n", subnqn, + hostnqn); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_HOST; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* + * SQSIZE is a 0-based value, so it must be at least 1 (minimum queue depth is 2) and + * strictly less than max_queue_depth. + */ + if (cmd->sqsize == 0 || cmd->sqsize >= qpair->transport->opts.max_queue_depth) { + SPDK_ERRLOG("Invalid SQSIZE %u (min 1, max %u)\n", + cmd->sqsize, qpair->transport->opts.max_queue_depth - 1); + SPDK_NVMF_INVALID_CONNECT_CMD(rsp, sqsize); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + qpair->sq_head_max = cmd->sqsize; + qpair->qid = cmd->qid; + + if (cmd->qid == 0) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Connect Admin Queue for controller ID 0x%x\n", data->cntlid); + + if (data->cntlid != 0xFFFF) { + /* This NVMf target only supports dynamic mode. */ + SPDK_ERRLOG("The NVMf target only supports dynamic mode (CNTLID = 0x%x).\n", data->cntlid); + SPDK_NVMF_INVALID_CONNECT_DATA(rsp, cntlid); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* Establish a new ctrlr */ + ctrlr = spdk_nvmf_ctrlr_create(subsystem, req, cmd, data); + if (!ctrlr) { + SPDK_ERRLOG("spdk_nvmf_ctrlr_create() failed\n"); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } else { + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + } else { + spdk_thread_send_msg(subsystem->thread, _spdk_nvmf_ctrlr_add_io_qpair, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } +} + +static uint64_t +nvmf_prop_get_cap(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cap.raw; +} + +static uint64_t +nvmf_prop_get_vs(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.vs.raw; +} + +static uint64_t +nvmf_prop_get_cc(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.cc.raw; +} + +static bool +nvmf_prop_set_cc(struct spdk_nvmf_ctrlr *ctrlr, uint64_t value) +{ + union spdk_nvme_cc_register cc, diff; + + cc.raw = (uint32_t)value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cur CC: 0x%08x\n", ctrlr->vcprop.cc.raw); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "new CC: 0x%08x\n", cc.raw); + + /* + * Calculate which bits changed between the current and new CC. + * Mark each bit as 0 once it is handled to determine if any unhandled bits were changed. + */ + diff.raw = cc.raw ^ ctrlr->vcprop.cc.raw; + + if (diff.bits.en) { + if (cc.bits.en) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Enable!\n"); + ctrlr->vcprop.cc.bits.en = 1; + ctrlr->vcprop.csts.bits.rdy = 1; + } else { + SPDK_ERRLOG("CC.EN transition from 1 to 0 (reset) not implemented!\n"); + + } + diff.bits.en = 0; + } + + if (diff.bits.shn) { + if (cc.bits.shn == SPDK_NVME_SHN_NORMAL || + cc.bits.shn == SPDK_NVME_SHN_ABRUPT) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Property Set CC Shutdown %u%ub!\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + ctrlr->vcprop.cc.bits.shn = cc.bits.shn; + ctrlr->vcprop.cc.bits.en = 0; + ctrlr->vcprop.csts.bits.rdy = 0; + ctrlr->vcprop.csts.bits.shst = SPDK_NVME_SHST_COMPLETE; + } else if (cc.bits.shn == 0) { + ctrlr->vcprop.cc.bits.shn = 0; + } else { + SPDK_ERRLOG("Prop Set CC: Invalid SHN value %u%ub\n", + cc.bits.shn >> 1, cc.bits.shn & 1); + return false; + } + diff.bits.shn = 0; + } + + if (diff.bits.iosqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOSQES = %u (%u bytes)\n", + cc.bits.iosqes, 1u << cc.bits.iosqes); + ctrlr->vcprop.cc.bits.iosqes = cc.bits.iosqes; + diff.bits.iosqes = 0; + } + + if (diff.bits.iocqes) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Prop Set IOCQES = %u (%u bytes)\n", + cc.bits.iocqes, 1u << cc.bits.iocqes); + ctrlr->vcprop.cc.bits.iocqes = cc.bits.iocqes; + diff.bits.iocqes = 0; + } + + if (diff.raw != 0) { + SPDK_ERRLOG("Prop Set CC toggled reserved bits 0x%x!\n", diff.raw); + return false; + } + + return true; +} + +static uint64_t +nvmf_prop_get_csts(struct spdk_nvmf_ctrlr *ctrlr) +{ + return ctrlr->vcprop.csts.raw; +} + +struct nvmf_prop { + uint32_t ofst; + uint8_t size; + char name[11]; + uint64_t (*get_cb)(struct spdk_nvmf_ctrlr *ctrlr); + bool (*set_cb)(struct spdk_nvmf_ctrlr *ctrlr, uint64_t value); +}; + +#define PROP(field, size, get_cb, set_cb) \ + { \ + offsetof(struct spdk_nvme_registers, field), \ + SPDK_NVMF_PROP_SIZE_##size, \ + #field, \ + get_cb, set_cb \ + } + +static const struct nvmf_prop nvmf_props[] = { + PROP(cap, 8, nvmf_prop_get_cap, NULL), + PROP(vs, 4, nvmf_prop_get_vs, NULL), + PROP(cc, 4, nvmf_prop_get_cc, nvmf_prop_set_cc), + PROP(csts, 4, nvmf_prop_get_csts, NULL), +}; + +static const struct nvmf_prop * +find_prop(uint32_t ofst) +{ + size_t i; + + for (i = 0; i < SPDK_COUNTOF(nvmf_props); i++) { + const struct nvmf_prop *prop = &nvmf_props[i]; + + if (prop->ofst == ofst) { + return prop; + } + } + + return NULL; +} + +static int +spdk_nvmf_property_get(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_get_cmd *cmd = &req->cmd->prop_get_cmd; + struct spdk_nvmf_fabric_prop_get_rsp *response = &req->rsp->prop_get_rsp; + const struct nvmf_prop *prop; + + response->status.sc = 0; + response->value.u64 = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x\n", + cmd->attrib.size, cmd->ofst); + + if (cmd->attrib.size != SPDK_NVMF_PROP_SIZE_4 && + cmd->attrib.size != SPDK_NVMF_PROP_SIZE_8) { + SPDK_ERRLOG("Invalid size value %d\n", cmd->attrib.size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + prop = find_prop(cmd->ofst); + if (prop == NULL || prop->get_cb == NULL) { + /* Reserved properties return 0 when read */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + if (cmd->attrib.size != prop->size) { + SPDK_ERRLOG("offset 0x%x size mismatch: cmd %u, prop %u\n", + cmd->ofst, cmd->attrib.size, prop->size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + response->value.u64 = prop->get_cb(ctrlr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "response value: 0x%" PRIx64 "\n", response->value.u64); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_property_set(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_fabric_prop_set_cmd *cmd = &req->cmd->prop_set_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + const struct nvmf_prop *prop; + uint64_t value; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "size %d, offset 0x%x, value 0x%" PRIx64 "\n", + cmd->attrib.size, cmd->ofst, cmd->value.u64); + + prop = find_prop(cmd->ofst); + if (prop == NULL || prop->set_cb == NULL) { + SPDK_ERRLOG("Invalid offset 0x%x\n", cmd->ofst); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "name: %s\n", prop->name); + if (cmd->attrib.size != prop->size) { + SPDK_ERRLOG("offset 0x%x size mismatch: cmd %u, prop %u\n", + cmd->ofst, cmd->attrib.size, prop->size); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + value = cmd->value.u64; + if (prop->size == SPDK_NVMF_PROP_SIZE_4) { + value = (uint32_t)value; + } + + if (!prop->set_cb(ctrlr, value)) { + SPDK_ERRLOG("prop set_cb failed\n"); + response->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + response->status.sc = SPDK_NVMF_FABRIC_SC_INVALID_PARAM; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_arbitration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Arbitration (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.arbitration.raw = cmd->cdw11; + ctrlr->feat.arbitration.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_power_management(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_power_management opts; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Power Management (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + /* Only PS = 0 is allowed, since we report NPSS = 0 */ + if (opts.bits.ps != 0) { + SPDK_ERRLOG("Invalid power state %u\n", opts.bits.ps); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.power_management.raw = cmd->cdw11; + ctrlr->feat.power_management.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static bool +temp_threshold_opts_valid(const union spdk_nvme_feat_temperature_threshold *opts) +{ + /* + * Valid TMPSEL values: + * 0000b - 1000b: temperature sensors + * 1111b: set all implemented temperature sensors + */ + if (opts->bits.tmpsel >= 9 && opts->bits.tmpsel != 15) { + /* 1001b - 1110b: reserved */ + SPDK_ERRLOG("Invalid TMPSEL %u\n", opts->bits.tmpsel); + return false; + } + + /* + * Valid THSEL values: + * 00b: over temperature threshold + * 01b: under temperature threshold + */ + if (opts->bits.thsel > 1) { + /* 10b - 11b: reserved */ + SPDK_ERRLOG("Invalid THSEL %u\n", opts->bits.thsel); + return false; + } + + return true; +} + +static int +spdk_nvmf_ctrlr_set_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_temperature_threshold opts; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + if (!temp_threshold_opts_valid(&opts)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - ignore new values */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features_temperature_threshold(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_temperature_threshold opts; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Temperature Threshold (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + if (!temp_threshold_opts_valid(&opts)) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + /* TODO: no sensors implemented - return 0 for all thresholds */ + rsp->cdw0 = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_error_recovery(struct spdk_nvmf_request *req) +{ + union spdk_nvme_feat_error_recovery opts; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Error Recovery (cdw11 = 0x%0x)\n", cmd->cdw11); + opts.raw = cmd->cdw11; + + if (opts.bits.dulbe) { + /* + * Host is not allowed to set this bit, since we don't advertise it in + * Identify Namespace. + */ + SPDK_ERRLOG("Host set unsupported DULBE bit\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->feat.error_recovery.raw = cmd->cdw11; + ctrlr->feat.error_recovery.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_volatile_write_cache(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.volatile_write_cache.raw = cmd->cdw11; + ctrlr->feat.volatile_write_cache.bits.reserved = 0; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Volatile Write Cache %s\n", + ctrlr->feat.volatile_write_cache.bits.wce ? "Enabled" : "Disabled"); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_write_atomicity(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Write Atomicity (cdw11 = 0x%0x)\n", cmd->cdw11); + + ctrlr->feat.write_atomicity.raw = cmd->cdw11; + ctrlr->feat.write_atomicity.bits.reserved = 0; + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + SPDK_ERRLOG("Set Features - Host Identifier not allowed\n"); + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features_host_identifier(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + union spdk_nvme_feat_host_identifier opts; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get Features - Host Identifier\n"); + + opts.raw = cmd->cdw11; + if (!opts.bits.exhid) { + /* NVMe over Fabrics requires EXHID=1 (128-bit/16-byte host ID) */ + SPDK_ERRLOG("Get Features - Host Identifier with EXHID=0 not allowed\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data == NULL || req->length < sizeof(ctrlr->hostid)) { + SPDK_ERRLOG("Invalid data buffer for Get Features - Host Identifier\n"); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + memcpy(req->data, ctrlr->hostid, sizeof(ctrlr->hostid)); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_keep_alive_timer(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer (%u ms)\n", cmd->cdw11); + + if (cmd->cdw11 == 0) { + rsp->status.sc = SPDK_NVME_SC_KEEP_ALIVE_INVALID; + } else if (cmd->cdw11 < MIN_KEEP_ALIVE_TIMEOUT) { + ctrlr->feat.keep_alive_timer.bits.kato = MIN_KEEP_ALIVE_TIMEOUT; + } else { + ctrlr->feat.keep_alive_timer.bits.kato = cmd->cdw11; + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Keep Alive Timer set to %u ms\n", + ctrlr->feat.keep_alive_timer.bits.kato); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_number_of_queues(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint32_t count; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Number of Queues, cdw11 0x%x\n", + req->cmd->nvme_cmd.cdw11); + + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + /* verify that the controller is ready to process commands */ + if (count > 1) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Queue pairs already active!\n"); + rsp->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + } else { + /* + * Ignore the value requested by the host - + * always return the pre-configured value based on max_qpairs_allowed. + */ + rsp->cdw0 = ctrlr->feat.number_of_queues.raw; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_set_features_async_event_configuration(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Set Features - Async Event Configuration, cdw11 0x%08x\n", + cmd->cdw11); + ctrlr->feat.async_event_configuration.raw = cmd->cdw11; + ctrlr->feat.async_event_configuration.bits.reserved = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_async_event_request(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Async Event Request\n"); + + /* Only one asynchronous event is supported for now */ + if (ctrlr->aer_req != NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "AERL exceeded\n"); + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + rsp->cdw0 = ctrlr->notice_event.raw; + ctrlr->notice_event.raw = 0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ctrlr->aer_req = req; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static void +spdk_nvmf_get_firmware_slot_log_page(void *buffer, uint64_t offset, uint32_t length) +{ + struct spdk_nvme_firmware_page fw_page; + size_t copy_len; + + memset(&fw_page, 0, sizeof(fw_page)); + fw_page.afi.active_slot = 1; + fw_page.afi.next_reset_slot = 0; + spdk_strcpy_pad(fw_page.revision[0], FW_VERSION, sizeof(fw_page.revision[0]), ' '); + + if (offset < sizeof(fw_page)) { + copy_len = spdk_min(sizeof(fw_page) - offset, length); + if (copy_len > 0) { + memcpy(buffer, (const char *)&fw_page + offset, copy_len); + } + } +} + +void +spdk_nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid) +{ + uint16_t max_changes = SPDK_COUNTOF(ctrlr->changed_ns_list.ns_list); + uint16_t i; + bool found = false; + + for (i = 0; i < ctrlr->changed_ns_list_count; i++) { + if (ctrlr->changed_ns_list.ns_list[i] == nsid) { + /* nsid is already in the list */ + found = true; + break; + } + } + + if (!found) { + if (ctrlr->changed_ns_list_count == max_changes) { + /* Out of space - set first entry to FFFFFFFFh and zero-fill the rest. */ + ctrlr->changed_ns_list.ns_list[0] = 0xFFFFFFFFu; + for (i = 1; i < max_changes; i++) { + ctrlr->changed_ns_list.ns_list[i] = 0; + } + } else { + ctrlr->changed_ns_list.ns_list[ctrlr->changed_ns_list_count++] = nsid; + } + } + + spdk_nvmf_ctrlr_async_event_ns_notice(ctrlr); +} + +static void +spdk_nvmf_get_changed_ns_list_log_page(struct spdk_nvmf_ctrlr *ctrlr, + void *buffer, uint64_t offset, uint32_t length) +{ + size_t copy_length; + + if (offset < sizeof(ctrlr->changed_ns_list)) { + copy_length = spdk_min(length, sizeof(ctrlr->changed_ns_list) - offset); + if (copy_length) { + memcpy(buffer, (char *)&ctrlr->changed_ns_list + offset, copy_length); + } + } + + /* Clear log page each time it is read */ + ctrlr->changed_ns_list_count = 0; + memset(&ctrlr->changed_ns_list, 0, sizeof(ctrlr->changed_ns_list)); +} + +/* The structure can be modified if we provide support for other commands in future */ +static const struct spdk_nvme_cmds_and_effect_log_page g_cmds_and_effect_log_page = { + .admin_cmds_supported = { + /* CSUPP, LBCC, NCC, NIC, CCC, CSE */ + /* Get Log Page */ + [SPDK_NVME_OPC_GET_LOG_PAGE] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Identify */ + [SPDK_NVME_OPC_IDENTIFY] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Abort */ + [SPDK_NVME_OPC_ABORT] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Set Features */ + [SPDK_NVME_OPC_SET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Get Features */ + [SPDK_NVME_OPC_GET_FEATURES] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Async Event Request */ + [SPDK_NVME_OPC_ASYNC_EVENT_REQUEST] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* Keep Alive */ + [SPDK_NVME_OPC_KEEP_ALIVE] = {1, 0, 0, 0, 0, 0, 0, 0}, + }, + .io_cmds_supported = { + /* FLUSH */ + [SPDK_NVME_OPC_FLUSH] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* WRITE */ + [SPDK_NVME_OPC_WRITE] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* READ */ + [SPDK_NVME_OPC_READ] = {1, 0, 0, 0, 0, 0, 0, 0}, + /* WRITE ZEROES */ + [SPDK_NVME_OPC_WRITE_ZEROES] = {1, 1, 0, 0, 0, 0, 0, 0}, + /* DATASET MANAGEMENT */ + [SPDK_NVME_OPC_DATASET_MANAGEMENT] = {1, 1, 0, 0, 0, 0, 0, 0}, + }, +}; + +static void +spdk_nvmf_get_cmds_and_effects_log_page(void *buffer, + uint64_t offset, uint32_t length) +{ + uint32_t page_size = sizeof(struct spdk_nvme_cmds_and_effect_log_page); + size_t copy_len = 0; + size_t zero_len = length; + + if (offset < page_size) { + copy_len = spdk_min(page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)(&g_cmds_and_effect_log_page) + offset, copy_len); + } + + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } +} + +static int +spdk_nvmf_ctrlr_get_log_page(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + uint64_t offset, len; + uint32_t numdl, numdu; + uint8_t lid; + + if (req->data == NULL) { + SPDK_ERRLOG("get log command with no buffer\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + offset = (uint64_t)cmd->cdw12 | ((uint64_t)cmd->cdw13 << 32); + if (offset & 3) { + SPDK_ERRLOG("Invalid log page offset 0x%" PRIx64 "\n", offset); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + numdl = (cmd->cdw10 >> 16) & 0xFFFFu; + numdu = (cmd->cdw11) & 0xFFFFu; + len = ((numdu << 16) + numdl + (uint64_t)1) * 4; + if (len > req->length) { + SPDK_ERRLOG("Get log page: len (%" PRIu64 ") > buf size (%u)\n", + len, req->length); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + lid = cmd->cdw10 & 0xFF; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Get log page: LID=0x%02X offset=0x%" PRIx64 " len=0x%" PRIx64 "\n", + lid, offset, len); + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + switch (lid) { + case SPDK_NVME_LOG_DISCOVERY: + spdk_nvmf_get_discovery_log_page(subsystem->tgt, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } else { + switch (lid) { + case SPDK_NVME_LOG_ERROR: + case SPDK_NVME_LOG_HEALTH_INFORMATION: + /* TODO: actually fill out log page data */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_FIRMWARE_SLOT: + spdk_nvmf_get_firmware_slot_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_COMMAND_EFFECTS_LOG: + spdk_nvmf_get_cmds_and_effects_log_page(req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + case SPDK_NVME_LOG_CHANGED_NS_LIST: + spdk_nvmf_get_changed_ns_list_log_page(ctrlr, req->data, offset, len); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + default: + goto invalid_log_page; + } + } + +invalid_log_page: + SPDK_ERRLOG("Unsupported Get Log Page 0x%02X\n", lid); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_ns(struct spdk_nvmf_ctrlr *ctrlr, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_ns *ns; + uint32_t max_num_blocks; + + if (cmd->nsid == 0 || cmd->nsid > subsystem->max_nsid) { + SPDK_ERRLOG("Identify Namespace for invalid NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _spdk_nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + /* + * Inactive namespaces should return a zero filled data structure. + * The data buffer is already zeroed by spdk_nvmf_ctrlr_process_admin_cmd(), + * so we can just return early here. + */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Identify Namespace for inactive NSID %u\n", cmd->nsid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + spdk_nvmf_bdev_ctrlr_identify_ns(ns, nsdata); + + /* Due to bug in the Linux kernel NVMe driver we have to set noiob no larger than mdts */ + max_num_blocks = ctrlr->admin_qpair->transport->opts.max_io_size / + (1U << nsdata->lbaf[nsdata->flbas.format].lbads); + if (nsdata->noiob > max_num_blocks) { + nsdata->noiob = max_num_blocks; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_ctrlr(struct spdk_nvmf_ctrlr *ctrlr, struct spdk_nvme_ctrlr_data *cdata) +{ + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + struct spdk_nvmf_transport *transport = ctrlr->admin_qpair->transport; + + /* + * Common fields for discovery and NVM subsystems + */ + spdk_strcpy_pad(cdata->fr, FW_VERSION, sizeof(cdata->fr), ' '); + assert((transport->opts.max_io_size % 4096) == 0); + cdata->mdts = spdk_u32log2(transport->opts.max_io_size / 4096); + cdata->cntlid = ctrlr->cntlid; + cdata->ver = ctrlr->vcprop.vs; + cdata->lpa.edlp = 1; + cdata->elpe = 127; + cdata->maxcmd = transport->opts.max_queue_depth; + cdata->sgls.supported = 1; + cdata->sgls.keyed_sgl = 1; + cdata->sgls.sgl_offset = 1; + spdk_strcpy_pad(cdata->subnqn, subsystem->subnqn, sizeof(cdata->subnqn), '\0'); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ctrlr data: maxcmd 0x%x\n", cdata->maxcmd); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "sgls data: 0x%x\n", from_le32(&cdata->sgls)); + + /* + * NVM subsystem fields (reserved for discovery subsystems) + */ + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_NVME) { + spdk_strcpy_pad(cdata->mn, MODEL_NUMBER, sizeof(cdata->mn), ' '); + spdk_strcpy_pad(cdata->sn, spdk_nvmf_subsystem_get_sn(subsystem), sizeof(cdata->sn), ' '); + cdata->kas = 10; + + cdata->rab = 6; + cdata->cmic.multi_port = 1; + cdata->cmic.multi_host = 1; + cdata->oaes.ns_attribute_notices = 1; + cdata->ctratt.host_id_exhid_supported = 1; + cdata->aerl = 0; + cdata->frmw.slot1_ro = 1; + cdata->frmw.num_slots = 1; + + cdata->lpa.celp = 1; /* Command Effects log page supported */ + + cdata->sqes.min = 6; + cdata->sqes.max = 6; + cdata->cqes.min = 4; + cdata->cqes.max = 4; + cdata->nn = subsystem->max_nsid; + cdata->vwc.present = 1; + cdata->vwc.flush_broadcast = SPDK_NVME_FLUSH_BROADCAST_NOT_SUPPORTED; + + cdata->nvmf_specific.ioccsz = sizeof(struct spdk_nvme_cmd) / 16; + cdata->nvmf_specific.iorcsz = sizeof(struct spdk_nvme_cpl) / 16; + cdata->nvmf_specific.icdoff = 0; /* offset starts directly after SQE */ + cdata->nvmf_specific.ctrattr.ctrlr_model = SPDK_NVMF_CTRLR_MODEL_DYNAMIC; + cdata->nvmf_specific.msdbd = 1; /* target supports single SGL in capsule */ + + /* TODO: this should be set by the transport */ + cdata->nvmf_specific.ioccsz += transport->opts.in_capsule_data_size / 16; + + cdata->oncs.dsm = spdk_nvmf_ctrlr_dsm_supported(ctrlr); + cdata->oncs.write_zeroes = spdk_nvmf_ctrlr_write_zeroes_supported(ctrlr); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ioccsz 0x%x\n", + cdata->nvmf_specific.ioccsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: iorcsz 0x%x\n", + cdata->nvmf_specific.iorcsz); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: icdoff 0x%x\n", + cdata->nvmf_specific.icdoff); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: ctrattr 0x%x\n", + *(uint8_t *)&cdata->nvmf_specific.ctrattr); + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "ext ctrlr data: msdbd 0x%x\n", + cdata->nvmf_specific.msdbd); + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify_active_ns_list(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + struct spdk_nvme_ns_list *ns_list) +{ + struct spdk_nvmf_ns *ns; + uint32_t count = 0; + + if (cmd->nsid >= 0xfffffffeUL) { + SPDK_ERRLOG("Identify Active Namespace List with invalid NSID %u\n", cmd->nsid); + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->opts.nsid <= cmd->nsid) { + continue; + } + + ns_list->ns_list[count++] = ns->opts.nsid; + if (count == SPDK_COUNTOF(ns_list->ns_list)) { + break; + } + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static void +_add_ns_id_desc(void **buf_ptr, size_t *buf_remain, + enum spdk_nvme_nidt type, + const void *data, size_t data_size) +{ + struct spdk_nvme_ns_id_desc *desc; + size_t desc_size = sizeof(*desc) + data_size; + + /* + * These should never fail in practice, since all valid NS ID descriptors + * should be defined so that they fit in the available 4096-byte buffer. + */ + assert(data_size > 0); + assert(data_size <= UINT8_MAX); + assert(desc_size < *buf_remain); + if (data_size == 0 || data_size > UINT8_MAX || desc_size > *buf_remain) { + return; + } + + desc = *buf_ptr; + desc->nidt = type; + desc->nidl = data_size; + memcpy(desc->nid, data, data_size); + + *buf_ptr += desc_size; + *buf_remain -= desc_size; +} + +static int +spdk_nvmf_ctrlr_identify_ns_id_descriptor_list( + struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_cmd *cmd, + struct spdk_nvme_cpl *rsp, + void *id_desc_list, size_t id_desc_list_size) +{ + struct spdk_nvmf_ns *ns; + size_t buf_remain = id_desc_list_size; + void *buf_ptr = id_desc_list; + + ns = _spdk_nvmf_subsystem_get_ns(subsystem, cmd->nsid); + if (ns == NULL || ns->bdev == NULL) { + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + +#define ADD_ID_DESC(type, data, size) \ + do { \ + if (!spdk_mem_all_zero(data, size)) { \ + _add_ns_id_desc(&buf_ptr, &buf_remain, type, data, size); \ + } \ + } while (0) + + ADD_ID_DESC(SPDK_NVME_NIDT_EUI64, ns->opts.eui64, sizeof(ns->opts.eui64)); + ADD_ID_DESC(SPDK_NVME_NIDT_NGUID, ns->opts.nguid, sizeof(ns->opts.nguid)); + ADD_ID_DESC(SPDK_NVME_NIDT_UUID, &ns->opts.uuid, sizeof(ns->opts.uuid)); + + /* + * The list is automatically 0-terminated because controller to host buffers in + * admin commands always get zeroed in spdk_nvmf_ctrlr_process_admin_cmd(). + */ + +#undef ADD_ID_DESC + + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_identify(struct spdk_nvmf_request *req) +{ + uint8_t cns; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_subsystem *subsystem = ctrlr->subsys; + + if (req->data == NULL || req->length < 4096) { + SPDK_ERRLOG("identify command with invalid buffer\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + cns = cmd->cdw10 & 0xFF; + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY && + cns != SPDK_NVME_IDENTIFY_CTRLR) { + /* Discovery controllers only support Identify Controller */ + goto invalid_cns; + } + + switch (cns) { + case SPDK_NVME_IDENTIFY_NS: + return spdk_nvmf_ctrlr_identify_ns(ctrlr, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_CTRLR: + return spdk_nvmf_ctrlr_identify_ctrlr(ctrlr, req->data); + case SPDK_NVME_IDENTIFY_ACTIVE_NS_LIST: + return spdk_nvmf_ctrlr_identify_active_ns_list(subsystem, cmd, rsp, req->data); + case SPDK_NVME_IDENTIFY_NS_ID_DESCRIPTOR_LIST: + return spdk_nvmf_ctrlr_identify_ns_id_descriptor_list(subsystem, cmd, rsp, req->data, req->length); + default: + goto invalid_cns; + } + +invalid_cns: + SPDK_ERRLOG("Identify command with unsupported CNS 0x%02x\n", cns); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + + +static struct spdk_nvmf_request * +spdk_nvmf_qpair_abort(struct spdk_nvmf_qpair *qpair, uint16_t cid) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + struct spdk_nvmf_request *req; + + if (spdk_nvmf_qpair_is_admin_queue(qpair)) { + if (ctrlr->aer_req && ctrlr->aer_req->cmd->nvme_cmd.cid == cid) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Aborting AER request\n"); + req = ctrlr->aer_req; + ctrlr->aer_req = NULL; + return req; + } + } + + /* TODO: track list of outstanding requests in qpair? */ + return NULL; +} + +static void +spdk_nvmf_ctrlr_abort_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + + spdk_nvmf_request_complete(req); +} + +static void +spdk_nvmf_ctrlr_abort_on_pg(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_request *req = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + uint16_t sqid = cmd->cdw10 & 0xFFFFu; + struct spdk_nvmf_qpair *qpair; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr == req->qpair->ctrlr && qpair->qid == sqid) { + struct spdk_nvmf_request *req_to_abort; + uint16_t cid = cmd->cdw10 >> 16; + + /* Found the qpair */ + + req_to_abort = spdk_nvmf_qpair_abort(qpair, cid); + if (req_to_abort == NULL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "cid %u not found\n", cid); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_FIELD; + spdk_for_each_channel_continue(i, -EINVAL); + return; + } + + /* Complete the request with aborted status */ + req_to_abort->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req_to_abort->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; + spdk_nvmf_request_complete(req_to_abort); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "abort ctrlr=%p req=%p sqid=%u cid=%u successful\n", + qpair->ctrlr, req_to_abort, sqid, cid); + rsp->cdw0 = 0; /* Command successfully aborted */ + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_SUCCESS; + /* Return -1 for the status so the iteration across threads stops. */ + spdk_for_each_channel_continue(i, -1); + + } + } + + spdk_for_each_channel_continue(i, 0); +} + +static int +spdk_nvmf_ctrlr_abort(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = 1; /* Command not aborted */ + rsp->status.sct = SPDK_NVME_SCT_COMMAND_SPECIFIC; + rsp->status.sc = SPDK_NVME_SC_INVALID_QUEUE_IDENTIFIER; + + /* Send a message to each poll group, searching for this ctrlr, sqid, and command. */ + spdk_for_each_channel(req->qpair->ctrlr->subsys->tgt, + spdk_nvmf_ctrlr_abort_on_pg, + req, + spdk_nvmf_ctrlr_abort_done + ); + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +get_features_generic(struct spdk_nvmf_request *req, uint32_t cdw0) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = cdw0; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +spdk_nvmf_ctrlr_get_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10 & 0xff; /* mask out the FID value */ + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return get_features_generic(req, ctrlr->feat.arbitration.raw); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return get_features_generic(req, ctrlr->feat.power_management.raw); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return spdk_nvmf_ctrlr_get_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return get_features_generic(req, ctrlr->feat.error_recovery.raw); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return get_features_generic(req, ctrlr->feat.volatile_write_cache.raw); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return get_features_generic(req, ctrlr->feat.number_of_queues.raw); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return get_features_generic(req, ctrlr->feat.write_atomicity.raw); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return get_features_generic(req, ctrlr->feat.async_event_configuration.raw); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return get_features_generic(req, ctrlr->feat.keep_alive_timer.raw); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return spdk_nvmf_ctrlr_get_features_host_identifier(req); + default: + SPDK_ERRLOG("Get Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +spdk_nvmf_ctrlr_set_features(struct spdk_nvmf_request *req) +{ + uint8_t feature; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + feature = cmd->cdw10 & 0xff; /* mask out the FID value */ + switch (feature) { + case SPDK_NVME_FEAT_ARBITRATION: + return spdk_nvmf_ctrlr_set_features_arbitration(req); + case SPDK_NVME_FEAT_POWER_MANAGEMENT: + return spdk_nvmf_ctrlr_set_features_power_management(req); + case SPDK_NVME_FEAT_TEMPERATURE_THRESHOLD: + return spdk_nvmf_ctrlr_set_features_temperature_threshold(req); + case SPDK_NVME_FEAT_ERROR_RECOVERY: + return spdk_nvmf_ctrlr_set_features_error_recovery(req); + case SPDK_NVME_FEAT_VOLATILE_WRITE_CACHE: + return spdk_nvmf_ctrlr_set_features_volatile_write_cache(req); + case SPDK_NVME_FEAT_NUMBER_OF_QUEUES: + return spdk_nvmf_ctrlr_set_features_number_of_queues(req); + case SPDK_NVME_FEAT_WRITE_ATOMICITY: + return spdk_nvmf_ctrlr_set_features_write_atomicity(req); + case SPDK_NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + return spdk_nvmf_ctrlr_set_features_async_event_configuration(req); + case SPDK_NVME_FEAT_KEEP_ALIVE_TIMER: + return spdk_nvmf_ctrlr_set_features_keep_alive_timer(req); + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + return spdk_nvmf_ctrlr_set_features_host_identifier(req); + default: + SPDK_ERRLOG("Set Features command with unsupported feature ID 0x%02x\n", feature); + response->status.sc = SPDK_NVME_SC_INVALID_FIELD; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +static int +spdk_nvmf_ctrlr_keep_alive(struct spdk_nvmf_request *req) +{ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Keep Alive\n"); + /* + * To handle keep alive just clear or reset the + * ctrlr based keep alive duration counter. + * When added, a separate timer based process + * will monitor if the time since last recorded + * keep alive has exceeded the max duration and + * take appropriate action. + */ + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + if (ctrlr == NULL) { + SPDK_ERRLOG("Admin command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (ctrlr->vcprop.cc.bits.en != 1) { + SPDK_ERRLOG("Admin command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (req->data && spdk_nvme_opc_get_data_transfer(cmd->opc) == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + memset(req->data, 0, req->length); + } + + if (ctrlr->subsys->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + /* Discovery controllers only support Get Log Page and Identify */ + switch (cmd->opc) { + case SPDK_NVME_OPC_IDENTIFY: + case SPDK_NVME_OPC_GET_LOG_PAGE: + break; + default: + goto invalid_opcode; + } + } + + switch (cmd->opc) { + case SPDK_NVME_OPC_GET_LOG_PAGE: + return spdk_nvmf_ctrlr_get_log_page(req); + case SPDK_NVME_OPC_IDENTIFY: + return spdk_nvmf_ctrlr_identify(req); + case SPDK_NVME_OPC_ABORT: + return spdk_nvmf_ctrlr_abort(req); + case SPDK_NVME_OPC_GET_FEATURES: + return spdk_nvmf_ctrlr_get_features(req); + case SPDK_NVME_OPC_SET_FEATURES: + return spdk_nvmf_ctrlr_set_features(req); + case SPDK_NVME_OPC_ASYNC_EVENT_REQUEST: + return spdk_nvmf_ctrlr_async_event_request(req); + case SPDK_NVME_OPC_KEEP_ALIVE: + return spdk_nvmf_ctrlr_keep_alive(req); + + case SPDK_NVME_OPC_CREATE_IO_SQ: + case SPDK_NVME_OPC_CREATE_IO_CQ: + case SPDK_NVME_OPC_DELETE_IO_SQ: + case SPDK_NVME_OPC_DELETE_IO_CQ: + /* Create and Delete I/O CQ/SQ not allowed in NVMe-oF */ + goto invalid_opcode; + + default: + goto invalid_opcode; + } + +invalid_opcode: + SPDK_ERRLOG("Unsupported admin opcode 0x%x\n", cmd->opc); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +int +spdk_nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + struct spdk_nvmf_capsule_cmd *cap_hdr; + + cap_hdr = &req->cmd->nvmf_cmd; + + if (qpair->ctrlr == NULL) { + /* No ctrlr established yet; the only valid command is Connect */ + if (cap_hdr->fctype == SPDK_NVMF_FABRIC_COMMAND_CONNECT) { + return spdk_nvmf_ctrlr_connect(req); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Got fctype 0x%x, expected Connect\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else if (spdk_nvmf_qpair_is_admin_queue(qpair)) { + /* + * Controller session is established, and this is an admin queue. + * Disallow Connect and allow other fabrics commands. + */ + switch (cap_hdr->fctype) { + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_SET: + return spdk_nvmf_property_set(req); + case SPDK_NVMF_FABRIC_COMMAND_PROPERTY_GET: + return spdk_nvmf_property_get(req); + default: + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "unknown fctype 0x%02x\n", + cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + } else { + /* Controller session is established, and this is an I/O queue */ + /* For now, no I/O-specific Fabrics commands are implemented (other than Connect) */ + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Unexpected I/O fctype 0x%x\n", cap_hdr->fctype); + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } +} + +int +spdk_nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr) +{ + struct spdk_nvmf_request *req; + struct spdk_nvme_cpl *rsp; + union spdk_nvme_async_event_completion event = {0}; + + /* Users may disable the event notification */ + if (!ctrlr->feat.async_event_configuration.bits.ns_attr_notice) { + return 0; + } + + event.bits.async_event_type = SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE; + event.bits.async_event_info = SPDK_NVME_ASYNC_EVENT_NS_ATTR_CHANGED; + event.bits.log_page_identifier = SPDK_NVME_LOG_CHANGED_NS_LIST; + + /* If there is no outstanding AER request, queue the event. Then + * if an AER is later submitted, this event can be sent as a + * response. + */ + if (!ctrlr->aer_req) { + if (ctrlr->notice_event.bits.async_event_type == + SPDK_NVME_ASYNC_EVENT_TYPE_NOTICE) { + return 0; + } + + ctrlr->notice_event.raw = event.raw; + return 0; + } + + req = ctrlr->aer_req; + rsp = &req->rsp->nvme_cpl; + + rsp->cdw0 = event.raw; + + spdk_nvmf_request_complete(req); + ctrlr->aer_req = NULL; + + return 0; +} + +void +spdk_nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + if (!spdk_nvmf_qpair_is_admin_queue(qpair)) { + return; + } + + if (ctrlr->aer_req != NULL) { + spdk_nvmf_request_free(ctrlr->aer_req); + ctrlr->aer_req = NULL; + } +} + +void +spdk_nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr) +{ + if (!ctrlr->aer_req) { + return; + } + + spdk_nvmf_request_complete(ctrlr->aer_req); + ctrlr->aer_req = NULL; +} diff --git a/src/spdk/lib/nvmf/ctrlr_bdev.c b/src/spdk/lib/nvmf/ctrlr_bdev.c new file mode 100644 index 00000000..7eb4f19a --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_bdev.c @@ -0,0 +1,531 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" + +#include "spdk/bdev.h" +#include "spdk/endian.h" +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" +#include "spdk/scsi_spec.h" +#include "spdk/string.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +static bool +spdk_nvmf_subsystem_bdev_io_type_supported(struct spdk_nvmf_subsystem *subsystem, + enum spdk_bdev_io_type io_type) +{ + struct spdk_nvmf_ns *ns; + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + if (ns->bdev == NULL) { + continue; + } + + if (!spdk_bdev_io_type_supported(ns->bdev, io_type)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "Subsystem %s namespace %u (%s) does not support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + ns->opts.nsid, spdk_bdev_get_name(ns->bdev), (int)io_type); + return false; + } + } + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "All devices in Subsystem %s support io_type %d\n", + spdk_nvmf_subsystem_get_nqn(subsystem), (int)io_type); + return true; +} + +bool +spdk_nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return spdk_nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_UNMAP); +} + +bool +spdk_nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr) +{ + return spdk_nvmf_subsystem_bdev_io_type_supported(ctrlr->subsys, SPDK_BDEV_IO_TYPE_WRITE_ZEROES); +} + +static void +nvmf_bdev_ctrlr_complete_cmd(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct spdk_nvmf_request *req = cb_arg; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + response->status.sc = sc; + response->status.sct = sct; + + spdk_nvmf_request_complete(req); + spdk_bdev_free_io(bdev_io); +} + +void +spdk_nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata) +{ + struct spdk_bdev *bdev = ns->bdev; + uint64_t num_blocks; + + num_blocks = spdk_bdev_get_num_blocks(bdev); + + nsdata->nsze = num_blocks; + nsdata->ncap = num_blocks; + nsdata->nuse = num_blocks; + nsdata->nlbaf = 0; + nsdata->flbas.format = 0; + nsdata->lbaf[0].lbads = spdk_u32log2(spdk_bdev_get_block_size(bdev)); + nsdata->noiob = spdk_bdev_get_optimal_io_boundary(bdev); + nsdata->nmic.can_share = 1; + + SPDK_STATIC_ASSERT(sizeof(nsdata->nguid) == sizeof(ns->opts.nguid), "size mismatch"); + memcpy(nsdata->nguid, ns->opts.nguid, sizeof(nsdata->nguid)); + + SPDK_STATIC_ASSERT(sizeof(nsdata->eui64) == sizeof(ns->opts.eui64), "size mismatch"); + memcpy(&nsdata->eui64, ns->opts.eui64, sizeof(nsdata->eui64)); +} + +static void +nvmf_bdev_ctrlr_get_rw_params(const struct spdk_nvme_cmd *cmd, uint64_t *start_lba, + uint64_t *num_blocks) +{ + /* SLBA: CDW10 and CDW11 */ + *start_lba = from_le64(&cmd->cdw10); + + /* NLB: CDW12 bits 15:00, 0's based */ + *num_blocks = (from_le32(&cmd->cdw12) & 0xFFFFu) + 1; +} + +static bool +nvmf_bdev_ctrlr_lba_in_range(uint64_t bdev_num_blocks, uint64_t io_start_lba, + uint64_t io_num_blocks) +{ + if (io_start_lba + io_num_blocks > bdev_num_blocks || + io_start_lba + io_num_blocks < io_start_lba) { + return false; + } + + return true; +} + +static void +spdk_nvmf_ctrlr_process_io_cmd_resubmit(void *arg) +{ + struct spdk_nvmf_request *req = arg; + + spdk_nvmf_ctrlr_process_io_cmd(req); +} + +static void +nvmf_bdev_ctrl_queue_io(struct spdk_nvmf_request *req, struct spdk_bdev *bdev, + struct spdk_io_channel *ch, spdk_bdev_io_wait_cb cb_fn, void *cb_arg) +{ + int rc; + + req->bdev_io_wait.bdev = bdev; + req->bdev_io_wait.cb_fn = cb_fn; + req->bdev_io_wait.cb_arg = cb_arg; + + rc = spdk_bdev_queue_io_wait(bdev, ch, &req->bdev_io_wait); + if (rc != 0) { + assert(false); + } +} + +static int +nvmf_bdev_ctrlr_read_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Read NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_readv_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_write_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + uint32_t block_size = spdk_bdev_get_block_size(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(num_blocks * block_size > req->length)) { + SPDK_ERRLOG("Write NLB %" PRIu64 " * block size %" PRIu32 " > SGL length %" PRIu32 "\n", + num_blocks, block_size, req->length); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_writev_blocks(desc, ch, req->iov, req->iovcnt, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_write_zeroes_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + uint64_t bdev_num_blocks = spdk_bdev_get_num_blocks(bdev); + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + uint64_t start_lba; + uint64_t num_blocks; + int rc; + + nvmf_bdev_ctrlr_get_rw_params(cmd, &start_lba, &num_blocks); + + if (spdk_unlikely(!nvmf_bdev_ctrlr_lba_in_range(bdev_num_blocks, start_lba, num_blocks))) { + SPDK_ERRLOG("end of media\n"); + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_LBA_OUT_OF_RANGE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_write_zeroes_blocks(desc, ch, start_lba, num_blocks, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + rsp->status.sct = SPDK_NVME_SCT_GENERIC; + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +static int +nvmf_bdev_ctrlr_flush_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + /* As for NVMeoF controller, SPDK always set volatile write + * cache bit to 1, return success for those block devices + * which can't support FLUSH command. + */ + if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH)) { + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev), + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +struct nvmf_virtual_ctrlr_unmap { + struct spdk_nvmf_request *req; + uint32_t count; + struct spdk_bdev_desc *desc; + struct spdk_bdev *bdev; + struct spdk_io_channel *ch; +}; + +static void +nvmf_virtual_ctrlr_dsm_cpl(struct spdk_bdev_io *bdev_io, bool success, + void *cb_arg) +{ + struct nvmf_virtual_ctrlr_unmap *unmap_ctx = cb_arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int sc, sct; + + unmap_ctx->count--; + + if (response->status.sct == SPDK_NVME_SCT_GENERIC && + response->status.sc == SPDK_NVME_SC_SUCCESS) { + spdk_bdev_io_get_nvme_status(bdev_io, &sct, &sc); + response->status.sc = sc; + response->status.sct = sct; + } + + if (unmap_ctx->count == 0) { + spdk_nvmf_request_complete(req); + free(unmap_ctx); + } + spdk_bdev_free_io(bdev_io); +} + +static int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_virtual_ctrlr_unmap *unmap_ctx); +static void +nvmf_bdev_ctrlr_dsm_cmd_resubmit(void *arg) +{ + struct nvmf_virtual_ctrlr_unmap *unmap_ctx = arg; + struct spdk_nvmf_request *req = unmap_ctx->req; + struct spdk_bdev_desc *desc = unmap_ctx->desc; + struct spdk_bdev *bdev = unmap_ctx->bdev; + struct spdk_io_channel *ch = unmap_ctx->ch; + + nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req, unmap_ctx); +} + +static int +nvmf_bdev_ctrlr_dsm_cmd(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req, + struct nvmf_virtual_ctrlr_unmap *unmap_ctx) +{ + uint32_t attribute; + uint16_t nr, i; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + int rc; + + nr = ((cmd->cdw10 & 0x000000ff) + 1); + if (nr * sizeof(struct spdk_nvme_dsm_range) > req->length) { + SPDK_ERRLOG("Dataset Management number of ranges > SGL length\n"); + response->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + attribute = cmd->cdw11 & 0x00000007; + if (attribute & SPDK_NVME_DSM_ATTR_DEALLOCATE) { + struct spdk_nvme_dsm_range *dsm_range; + uint64_t lba; + uint32_t lba_count; + + if (unmap_ctx == NULL) { + unmap_ctx = calloc(1, sizeof(*unmap_ctx)); + if (!unmap_ctx) { + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + unmap_ctx->req = req; + unmap_ctx->desc = desc; + unmap_ctx->ch = ch; + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + + dsm_range = (struct spdk_nvme_dsm_range *)req->data; + for (i = unmap_ctx->count; i < nr; i++) { + lba = dsm_range[i].starting_lba; + lba_count = dsm_range[i].length; + + unmap_ctx->count++; + + rc = spdk_bdev_unmap_blocks(desc, ch, lba, lba_count, + nvmf_virtual_ctrlr_dsm_cpl, unmap_ctx); + if (rc) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, nvmf_bdev_ctrlr_dsm_cmd_resubmit, unmap_ctx); + /* Unmap was not yet submitted to bdev */ + unmap_ctx->count--; + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + response->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + unmap_ctx->count--; + /* We can't return here - we may have to wait for any other + * unmaps already sent to complete */ + break; + } + } + + if (unmap_ctx->count == 0) { + free(unmap_ctx); + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_SUCCESS; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; +} + +static int +nvmf_bdev_ctrlr_nvme_passthru_io(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, + struct spdk_io_channel *ch, struct spdk_nvmf_request *req) +{ + int rc; + + rc = spdk_bdev_nvme_io_passthru(desc, ch, &req->cmd->nvme_cmd, req->data, req->length, + nvmf_bdev_ctrlr_complete_cmd, req); + if (spdk_unlikely(rc)) { + if (rc == -ENOMEM) { + nvmf_bdev_ctrl_queue_io(req, bdev, ch, spdk_nvmf_ctrlr_process_io_cmd_resubmit, req); + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; + } + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_INVALID_OPCODE; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + return SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS; +} + +int +spdk_nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req) +{ + uint32_t nsid; + struct spdk_nvmf_ns *ns; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group = req->qpair->group; + struct spdk_nvmf_ctrlr *ctrlr = req->qpair->ctrlr; + struct spdk_nvme_cmd *cmd = &req->cmd->nvme_cmd; + struct spdk_nvme_cpl *response = &req->rsp->nvme_cpl; + + /* pre-set response details for this command */ + response->status.sc = SPDK_NVME_SC_SUCCESS; + nsid = cmd->nsid; + + if (spdk_unlikely(ctrlr == NULL)) { + SPDK_ERRLOG("I/O command sent before CONNECT\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + if (spdk_unlikely(ctrlr->vcprop.cc.bits.en != 1)) { + SPDK_ERRLOG("I/O command sent to disabled controller\n"); + response->status.sct = SPDK_NVME_SCT_GENERIC; + response->status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + ns = _spdk_nvmf_subsystem_get_ns(ctrlr->subsys, nsid); + if (ns == NULL || ns->bdev == NULL) { + SPDK_ERRLOG("Unsuccessful query for nsid %u\n", cmd->nsid); + response->status.sc = SPDK_NVME_SC_INVALID_NAMESPACE_OR_FORMAT; + response->status.dnr = 1; + return SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE; + } + + bdev = ns->bdev; + desc = ns->desc; + ch = group->sgroups[ctrlr->subsys->id].channels[nsid - 1]; + switch (cmd->opc) { + case SPDK_NVME_OPC_READ: + return nvmf_bdev_ctrlr_read_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE: + return nvmf_bdev_ctrlr_write_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_WRITE_ZEROES: + return nvmf_bdev_ctrlr_write_zeroes_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_FLUSH: + return nvmf_bdev_ctrlr_flush_cmd(bdev, desc, ch, req); + case SPDK_NVME_OPC_DATASET_MANAGEMENT: + return nvmf_bdev_ctrlr_dsm_cmd(bdev, desc, ch, req, NULL); + default: + return nvmf_bdev_ctrlr_nvme_passthru_io(bdev, desc, ch, req); + } +} diff --git a/src/spdk/lib/nvmf/ctrlr_discovery.c b/src/spdk/lib/nvmf/ctrlr_discovery.c new file mode 100644 index 00000000..305a6076 --- /dev/null +++ b/src/spdk/lib/nvmf/ctrlr_discovery.c @@ -0,0 +1,144 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * NVMe over Fabrics discovery service + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/event.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" + +static void +nvmf_update_discovery_log(struct spdk_nvmf_tgt *tgt) +{ + uint64_t numrec = 0; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_listener *listener; + struct spdk_nvmf_discovery_log_page_entry *entry; + struct spdk_nvmf_discovery_log_page *disc_log; + size_t cur_size; + uint32_t sid; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Generating log page for genctr %" PRIu64 "\n", + tgt->discovery_genctr); + + cur_size = sizeof(struct spdk_nvmf_discovery_log_page); + disc_log = calloc(1, cur_size); + if (disc_log == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + return; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (subsystem->subtype == SPDK_NVMF_SUBTYPE_DISCOVERY) { + continue; + } + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + size_t new_size = cur_size + sizeof(*entry); + void *new_log_page = realloc(disc_log, new_size); + + if (new_log_page == NULL) { + SPDK_ERRLOG("Discovery log page memory allocation error\n"); + break; + } + + disc_log = new_log_page; + cur_size = new_size; + + entry = &disc_log->entries[numrec]; + memset(entry, 0, sizeof(*entry)); + entry->portid = numrec; + entry->cntlid = 0xffff; + entry->asqsz = listener->transport->opts.max_aq_depth; + entry->subtype = subsystem->subtype; + snprintf(entry->subnqn, sizeof(entry->subnqn), "%s", subsystem->subnqn); + + spdk_nvmf_transport_listener_discover(listener->transport, &listener->trid, entry); + + numrec++; + } + } + + disc_log->numrec = numrec; + disc_log->genctr = tgt->discovery_genctr; + + free(tgt->discovery_log_page); + + tgt->discovery_log_page = disc_log; + tgt->discovery_log_page_size = cur_size; +} + +void +spdk_nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, void *buffer, + uint64_t offset, uint32_t length) +{ + size_t copy_len = 0; + size_t zero_len = length; + + if (tgt->discovery_log_page == NULL || + tgt->discovery_log_page->genctr != tgt->discovery_genctr) { + nvmf_update_discovery_log(tgt); + } + + /* Copy the valid part of the discovery log page, if any */ + if (tgt->discovery_log_page && offset < tgt->discovery_log_page_size) { + copy_len = spdk_min(tgt->discovery_log_page_size - offset, length); + zero_len -= copy_len; + memcpy(buffer, (char *)tgt->discovery_log_page + offset, copy_len); + } + + /* Zero out the rest of the buffer */ + if (zero_len) { + memset((char *)buffer + copy_len, 0, zero_len); + } + + /* We should have copied or zeroed every byte of the output buffer. */ + assert(copy_len + zero_len == length); +} diff --git a/src/spdk/lib/nvmf/nvmf.c b/src/spdk/lib/nvmf/nvmf.c new file mode 100644 index 00000000..32539f53 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf.c @@ -0,0 +1,1173 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "spdk/bdev.h" +#include "spdk/bit_array.h" +#include "spdk/conf.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/trace.h" +#include "spdk/endian.h" +#include "spdk/string.h" + +#include "spdk_internal/log.h" + +#include "nvmf_internal.h" +#include "transport.h" + +SPDK_LOG_REGISTER_COMPONENT("nvmf", SPDK_LOG_NVMF) + +#define SPDK_NVMF_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 +#define SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS 1024 +#define SPDK_NVMF_DEFAULT_IO_UNIT_SIZE 131072 + +typedef void (*nvmf_qpair_disconnect_cpl)(void *ctx, int status); +static void spdk_nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf); + +/* supplied to a single call to nvmf_qpair_disconnect */ +struct nvmf_qpair_disconnect_ctx { + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_ctrlr *ctrlr; + nvmf_qpair_disconnect_cb cb_fn; + struct spdk_thread *thread; + void *ctx; + uint16_t qid; +}; + +/* + * There are several times when we need to iterate through the list of all qpairs and selectively delete them. + * In order to do this sequentially without overlap, we must provide a context to recover the next qpair from + * to enable calling nvmf_qpair_disconnect on the next desired qpair. + */ +struct nvmf_qpair_disconnect_many_ctx { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + spdk_nvmf_poll_group_mod_done cpl_fn; + void *cpl_ctx; +}; + +static void +spdk_nvmf_qpair_set_state(struct spdk_nvmf_qpair *qpair, + enum spdk_nvmf_qpair_state state) +{ + assert(qpair != NULL); + assert(qpair->group->thread == spdk_get_thread()); + + qpair->state = state; +} + +void +spdk_nvmf_tgt_opts_init(struct spdk_nvmf_tgt_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_DEFAULT_MAX_IO_SIZE; + opts->max_subsystems = SPDK_NVMF_DEFAULT_MAX_SUBSYSTEMS; + opts->io_unit_size = SPDK_NVMF_DEFAULT_IO_UNIT_SIZE; +} + +static int +spdk_nvmf_poll_group_poll(void *ctx) +{ + struct spdk_nvmf_poll_group *group = ctx; + int rc; + int count = 0; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + rc = spdk_nvmf_transport_poll_group_poll(tgroup); + if (rc < 0) { + return -1; + } + count += rc; + } + + return count; +} + +static int +spdk_nvmf_tgt_create_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport *transport; + uint32_t sid; + + TAILQ_INIT(&group->tgroups); + TAILQ_INIT(&group->qpairs); + + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_nvmf_poll_group_add_transport(group, transport); + } + + group->num_sgroups = tgt->opts.max_subsystems; + group->sgroups = calloc(tgt->opts.max_subsystems, sizeof(struct spdk_nvmf_subsystem_poll_group)); + if (!group->sgroups) { + return -1; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + struct spdk_nvmf_subsystem *subsystem; + + subsystem = tgt->subsystems[sid]; + if (!subsystem) { + continue; + } + + if (spdk_nvmf_poll_group_add_subsystem(group, subsystem, NULL, NULL) != 0) { + spdk_nvmf_tgt_destroy_poll_group(io_device, ctx_buf); + return -1; + } + } + + group->poller = spdk_poller_register(spdk_nvmf_poll_group_poll, group, 0); + group->thread = spdk_get_thread(); + + return 0; +} + +static void +spdk_nvmf_tgt_destroy_poll_group(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_poll_group *group = ctx_buf; + struct spdk_nvmf_transport_poll_group *tgroup, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t sid, nsid; + + TAILQ_FOREACH_SAFE(tgroup, &group->tgroups, link, tmp) { + TAILQ_REMOVE(&group->tgroups, tgroup, link); + spdk_nvmf_transport_poll_group_destroy(tgroup); + } + + for (sid = 0; sid < group->num_sgroups; sid++) { + sgroup = &group->sgroups[sid]; + + for (nsid = 0; nsid < sgroup->num_channels; nsid++) { + if (sgroup->channels[nsid]) { + spdk_put_io_channel(sgroup->channels[nsid]); + sgroup->channels[nsid] = NULL; + } + } + + free(sgroup->channels); + } + + free(group->sgroups); +} + +static void +_nvmf_tgt_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_poll_group *group = qpair_ctx->group; + struct spdk_io_channel *ch; + int rc = 0; + + qpair = TAILQ_FIRST(&group->qpairs); + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_tgt_disconnect_next_qpair, ctx); + } + + if (!qpair || rc != 0) { + /* When the refcount from the channels reaches 0, spdk_nvmf_tgt_destroy_poll_group will be called. */ + ch = spdk_io_channel_from_ctx(group); + spdk_put_io_channel(ch); + free(qpair_ctx); + } +} + +static void +spdk_nvmf_tgt_destroy_poll_group_qpairs(struct spdk_nvmf_poll_group *group) +{ + struct nvmf_qpair_disconnect_many_ctx *ctx; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Failed to allocate memory for destroy poll group ctx\n"); + return; + } + + spdk_poller_unregister(&group->poller); + + ctx->group = group; + _nvmf_tgt_disconnect_next_qpair(ctx); +} + +struct spdk_nvmf_tgt * +spdk_nvmf_tgt_create(struct spdk_nvmf_tgt_opts *opts) +{ + struct spdk_nvmf_tgt *tgt; + + tgt = calloc(1, sizeof(*tgt)); + if (!tgt) { + return NULL; + } + + if (!opts) { + spdk_nvmf_tgt_opts_init(&tgt->opts); + } else { + tgt->opts = *opts; + } + + tgt->discovery_genctr = 0; + tgt->discovery_log_page = NULL; + tgt->discovery_log_page_size = 0; + TAILQ_INIT(&tgt->transports); + + tgt->subsystems = calloc(tgt->opts.max_subsystems, sizeof(struct spdk_nvmf_subsystem *)); + if (!tgt->subsystems) { + free(tgt); + return NULL; + } + + spdk_io_device_register(tgt, + spdk_nvmf_tgt_create_poll_group, + spdk_nvmf_tgt_destroy_poll_group, + sizeof(struct spdk_nvmf_poll_group), + "nvmf_tgt"); + + return tgt; +} + +static void +spdk_nvmf_tgt_destroy_cb(void *io_device) +{ + struct spdk_nvmf_tgt *tgt = io_device; + struct spdk_nvmf_transport *transport, *transport_tmp; + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; + uint32_t i; + + if (tgt->discovery_log_page) { + free(tgt->discovery_log_page); + } + + if (tgt->subsystems) { + for (i = 0; i < tgt->opts.max_subsystems; i++) { + if (tgt->subsystems[i]) { + spdk_nvmf_subsystem_destroy(tgt->subsystems[i]); + } + } + free(tgt->subsystems); + } + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, transport_tmp) { + TAILQ_REMOVE(&tgt->transports, transport, link); + spdk_nvmf_transport_destroy(transport); + } + + destroy_cb_fn = tgt->destroy_cb_fn; + destroy_cb_arg = tgt->destroy_cb_arg; + + free(tgt); + + if (destroy_cb_fn) { + destroy_cb_fn(destroy_cb_arg, 0); + } +} + +void +spdk_nvmf_tgt_destroy(struct spdk_nvmf_tgt *tgt, + spdk_nvmf_tgt_destroy_done_fn cb_fn, + void *cb_arg) +{ + tgt->destroy_cb_fn = cb_fn; + tgt->destroy_cb_arg = cb_arg; + + spdk_io_device_unregister(tgt, spdk_nvmf_tgt_destroy_cb); +} + +static void +spdk_nvmf_write_subsystem_config_json(struct spdk_json_write_ctx *w, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_host *host; + struct spdk_nvmf_listener *listener; + const struct spdk_nvme_transport_id *trid; + struct spdk_nvmf_ns *ns; + struct spdk_nvmf_ns_opts ns_opts; + uint32_t max_namespaces; + char uuid_str[SPDK_UUID_STRING_LEN]; + const char *trtype; + const char *adrfam; + + if (spdk_nvmf_subsystem_get_type(subsystem) != SPDK_NVMF_SUBTYPE_NVME) { + return; + } + + /* { */ + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_create"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_bool(w, "allow_any_host", spdk_nvmf_subsystem_get_allow_any_host(subsystem)); + spdk_json_write_named_string(w, "serial_number", spdk_nvmf_subsystem_get_sn(subsystem)); + + max_namespaces = spdk_nvmf_subsystem_get_max_namespaces(subsystem); + if (max_namespaces != 0) { + spdk_json_write_named_uint32(w, "max_namespaces", max_namespaces); + } + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + + for (listener = spdk_nvmf_subsystem_get_first_listener(subsystem); listener != NULL; + listener = spdk_nvmf_subsystem_get_next_listener(subsystem, listener)) { + trid = spdk_nvmf_listener_get_trid(listener); + + trtype = spdk_nvme_transport_id_trtype_str(trid->trtype); + adrfam = spdk_nvme_transport_id_adrfam_str(trid->adrfam); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_listener"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "listen_address" : { */ + spdk_json_write_named_object_begin(w, "listen_address"); + + spdk_json_write_named_string(w, "trtype", trtype); + if (adrfam) { + spdk_json_write_named_string(w, "adrfam", adrfam); + } + + spdk_json_write_named_string(w, "traddr", trid->traddr); + spdk_json_write_named_string(w, "trsvcid", trid->trsvcid); + /* } "listen_address" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (host = spdk_nvmf_subsystem_get_first_host(subsystem); host != NULL; + host = spdk_nvmf_subsystem_get_next_host(subsystem, host)) { + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_host"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + spdk_json_write_named_string(w, "host", spdk_nvmf_host_get_nqn(host)); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } + + for (ns = spdk_nvmf_subsystem_get_first_ns(subsystem); ns != NULL; + ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns)) { + spdk_nvmf_ns_get_opts(ns, &ns_opts, sizeof(ns_opts)); + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_subsystem_add_ns"); + + /* "params" : { */ + spdk_json_write_named_object_begin(w, "params"); + + spdk_json_write_named_string(w, "nqn", spdk_nvmf_subsystem_get_nqn(subsystem)); + + /* "namespace" : { */ + spdk_json_write_named_object_begin(w, "namespace"); + + spdk_json_write_named_uint32(w, "nsid", spdk_nvmf_ns_get_id(ns)); + spdk_json_write_named_string(w, "bdev_name", spdk_bdev_get_name(spdk_nvmf_ns_get_bdev(ns))); + + if (!spdk_mem_all_zero(ns_opts.nguid, sizeof(ns_opts.nguid))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.nguid) == sizeof(uint64_t) * 2, "size mismatch"); + spdk_json_write_named_string_fmt(w, "nguid", "%016"PRIX64"%016"PRIX64, from_be64(&ns_opts.nguid[0]), + from_be64(&ns_opts.nguid[8])); + } + + if (!spdk_mem_all_zero(ns_opts.eui64, sizeof(ns_opts.eui64))) { + SPDK_STATIC_ASSERT(sizeof(ns_opts.eui64) == sizeof(uint64_t), "size mismatch"); + spdk_json_write_named_string_fmt(w, "eui64", "%016"PRIX64, from_be64(&ns_opts.eui64)); + } + + if (!spdk_mem_all_zero(&ns_opts.uuid, sizeof(ns_opts.uuid))) { + spdk_uuid_fmt_lower(uuid_str, sizeof(uuid_str), &ns_opts.uuid); + spdk_json_write_named_string(w, "uuid", uuid_str); + } + + /* "namespace" */ + spdk_json_write_object_end(w); + + /* } "params" */ + spdk_json_write_object_end(w); + + /* } */ + spdk_json_write_object_end(w); + } +} + +void +spdk_nvmf_tgt_write_config_json(struct spdk_json_write_ctx *w, struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_transport *transport; + + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "set_nvmf_target_options"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_uint32(w, "max_queue_depth", tgt->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_qpairs_per_ctrlr", tgt->opts.max_qpairs_per_ctrlr); + spdk_json_write_named_uint32(w, "in_capsule_data_size", tgt->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", tgt->opts.max_io_size); + spdk_json_write_named_uint32(w, "max_subsystems", tgt->opts.max_subsystems); + spdk_json_write_named_uint32(w, "io_unit_size", tgt->opts.io_unit_size); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + + /* write transports */ + TAILQ_FOREACH(transport, &tgt->transports, link) { + spdk_json_write_object_begin(w); + spdk_json_write_named_string(w, "method", "nvmf_create_transport"); + + spdk_json_write_named_object_begin(w, "params"); + spdk_json_write_named_string(w, "trtype", spdk_nvme_transport_id_trtype_str(transport->ops->type)); + spdk_json_write_named_uint32(w, "max_queue_depth", transport->opts.max_queue_depth); + spdk_json_write_named_uint32(w, "max_qpairs_per_ctrlr", transport->opts.max_qpairs_per_ctrlr); + spdk_json_write_named_uint32(w, "in_capsule_data_size", transport->opts.in_capsule_data_size); + spdk_json_write_named_uint32(w, "max_io_size", transport->opts.max_io_size); + spdk_json_write_named_uint32(w, "io_unit_size", transport->opts.io_unit_size); + spdk_json_write_named_uint32(w, "max_aq_depth", transport->opts.max_aq_depth); + spdk_json_write_object_end(w); + + spdk_json_write_object_end(w); + } + + subsystem = spdk_nvmf_subsystem_get_first(tgt); + while (subsystem) { + spdk_nvmf_write_subsystem_config_json(w, subsystem); + subsystem = spdk_nvmf_subsystem_get_next(subsystem); + } +} + +void +spdk_nvmf_tgt_listen(struct spdk_nvmf_tgt *tgt, + struct spdk_nvme_transport_id *trid, + spdk_nvmf_tgt_listen_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_transport *transport; + int rc; + bool propagate = false; + + transport = spdk_nvmf_tgt_get_transport(tgt, trid->trtype); + if (!transport) { + struct spdk_nvmf_transport_opts opts; + + opts.max_queue_depth = tgt->opts.max_queue_depth; + opts.max_qpairs_per_ctrlr = tgt->opts.max_qpairs_per_ctrlr; + opts.in_capsule_data_size = tgt->opts.in_capsule_data_size; + opts.max_io_size = tgt->opts.max_io_size; + opts.io_unit_size = tgt->opts.io_unit_size; + /* use max_queue depth since tgt. opts. doesn't have max_aq_depth */ + opts.max_aq_depth = tgt->opts.max_queue_depth; + + transport = spdk_nvmf_transport_create(trid->trtype, &opts); + if (!transport) { + SPDK_ERRLOG("Transport initialization failed\n"); + cb_fn(cb_arg, -EINVAL); + return; + } + + propagate = true; + } + + rc = spdk_nvmf_transport_listen(transport, trid); + if (rc < 0) { + SPDK_ERRLOG("Unable to listen on address '%s'\n", trid->traddr); + cb_fn(cb_arg, rc); + return; + } + + tgt->discovery_genctr++; + + if (propagate) { + spdk_nvmf_tgt_add_transport(tgt, transport, cb_fn, cb_arg); + } else { + cb_fn(cb_arg, 0); + } +} + +struct spdk_nvmf_tgt_add_transport_ctx { + struct spdk_nvmf_tgt *tgt; + struct spdk_nvmf_transport *transport; + spdk_nvmf_tgt_add_transport_done_fn cb_fn; + void *cb_arg; +}; + +static void +_spdk_nvmf_tgt_add_transport_done(struct spdk_io_channel_iter *i, int status) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + ctx->cb_fn(ctx->cb_arg, status); + + free(ctx); +} + +static void +_spdk_nvmf_tgt_add_transport(struct spdk_io_channel_iter *i) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); + struct spdk_nvmf_poll_group *group = spdk_io_channel_get_ctx(ch); + int rc; + + rc = spdk_nvmf_poll_group_add_transport(group, ctx->transport); + spdk_for_each_channel_continue(i, rc); +} + +void spdk_nvmf_tgt_add_transport(struct spdk_nvmf_tgt *tgt, + struct spdk_nvmf_transport *transport, + spdk_nvmf_tgt_add_transport_done_fn cb_fn, + void *cb_arg) +{ + struct spdk_nvmf_tgt_add_transport_ctx *ctx; + + if (spdk_nvmf_tgt_get_transport(tgt, transport->ops->type)) { + cb_fn(cb_arg, -EEXIST); + return; /* transport already created */ + } + + transport->tgt = tgt; + TAILQ_INSERT_TAIL(&tgt->transports, transport, link); + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + cb_fn(cb_arg, -ENOMEM); + return; + } + + ctx->tgt = tgt; + ctx->transport = transport; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(tgt, + _spdk_nvmf_tgt_add_transport, + ctx, + _spdk_nvmf_tgt_add_transport_done); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_tgt_find_subsystem(struct spdk_nvmf_tgt *tgt, const char *subnqn) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (!subnqn) { + return NULL; + } + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem == NULL) { + continue; + } + + if (strcmp(subnqn, subsystem->subnqn) == 0) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, enum spdk_nvme_transport_type type) +{ + struct spdk_nvmf_transport *transport; + + TAILQ_FOREACH(transport, &tgt->transports, link) { + if (transport->ops->type == type) { + return transport; + } + } + + return NULL; +} + +void +spdk_nvmf_tgt_accept(struct spdk_nvmf_tgt *tgt, new_qpair_fn cb_fn) +{ + struct spdk_nvmf_transport *transport, *tmp; + + TAILQ_FOREACH_SAFE(transport, &tgt->transports, link, tmp) { + spdk_nvmf_transport_accept(transport, cb_fn); + } +} + +struct spdk_nvmf_poll_group * +spdk_nvmf_poll_group_create(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_io_channel *ch; + + ch = spdk_get_io_channel(tgt); + if (!ch) { + SPDK_ERRLOG("Unable to get I/O channel for target\n"); + return NULL; + } + + return spdk_io_channel_get_ctx(ch); +} + +void +spdk_nvmf_poll_group_destroy(struct spdk_nvmf_poll_group *group) +{ + /* This function will put the io_channel associated with this poll group */ + spdk_nvmf_tgt_destroy_poll_group_qpairs(group); +} + +int +spdk_nvmf_poll_group_add(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + int rc = -1; + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_INIT(&qpair->outstanding); + qpair->group = group; + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVATING); + + TAILQ_INSERT_TAIL(&group->qpairs, qpair, link); + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == qpair->transport) { + rc = spdk_nvmf_transport_poll_group_add(tgroup, qpair); + break; + } + } + + if (rc == 0) { + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_ACTIVE); + } else { + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_INACTIVE); + } + + return rc; +} + +static +void _nvmf_ctrlr_destruct(void *ctx) +{ + struct spdk_nvmf_ctrlr *ctrlr = ctx; + + spdk_nvmf_ctrlr_destruct(ctrlr); +} + +static void +_spdk_nvmf_ctrlr_free_from_qpair(void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_ctrlr *ctrlr = qpair_ctx->ctrlr; + uint32_t count; + + spdk_bit_array_clear(ctrlr->qpair_mask, qpair_ctx->qid); + count = spdk_bit_array_count_set(ctrlr->qpair_mask); + if (count == 0) { + spdk_bit_array_free(&ctrlr->qpair_mask); + + spdk_thread_send_msg(ctrlr->subsys->thread, _nvmf_ctrlr_destruct, ctrlr); + } + + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); +} + +static void +_spdk_nvmf_qpair_destroy(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx = ctx; + struct spdk_nvmf_qpair *qpair = qpair_ctx->qpair; + struct spdk_nvmf_ctrlr *ctrlr = qpair->ctrlr; + + assert(qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING); + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_INACTIVE); + qpair_ctx->qid = qpair->qid; + + TAILQ_REMOVE(&qpair->group->qpairs, qpair, link); + qpair->group = NULL; + + spdk_nvmf_transport_qpair_fini(qpair); + + if (!ctrlr || !ctrlr->thread) { + if (qpair_ctx->cb_fn) { + spdk_thread_send_msg(qpair_ctx->thread, qpair_ctx->cb_fn, qpair_ctx->ctx); + } + free(qpair_ctx); + return; + } + + qpair_ctx->ctrlr = ctrlr; + spdk_thread_send_msg(ctrlr->thread, _spdk_nvmf_ctrlr_free_from_qpair, qpair_ctx); + +} + +int +spdk_nvmf_qpair_disconnect(struct spdk_nvmf_qpair *qpair, nvmf_qpair_disconnect_cb cb_fn, void *ctx) +{ + struct nvmf_qpair_disconnect_ctx *qpair_ctx; + + /* If we get a qpair in the uninitialized state, we can just destroy it immediately */ + if (qpair->state == SPDK_NVMF_QPAIR_UNINITIALIZED) { + spdk_nvmf_transport_qpair_fini(qpair); + if (cb_fn) { + cb_fn(ctx); + } + return 0; + } + + /* The queue pair must be disconnected from the thread that owns it */ + assert(qpair->group->thread == spdk_get_thread()); + + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING || + qpair->state == SPDK_NVMF_QPAIR_INACTIVE) { + /* This can occur if the connection is killed by the target, + * which results in a notification that the connection + * died. Send a message to defer the processing of this + * callback. This allows the stack to unwind in the case + * where a bunch of connections are disconnected in + * a loop. */ + if (cb_fn) { + spdk_thread_send_msg(qpair->group->thread, cb_fn, ctx); + } + return 0; + } + + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + spdk_nvmf_qpair_set_state(qpair, SPDK_NVMF_QPAIR_DEACTIVATING); + + qpair_ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_ctx)); + if (!qpair_ctx) { + SPDK_ERRLOG("Unable to allocate context for nvmf_qpair_disconnect\n"); + return -ENOMEM; + } + + qpair_ctx->qpair = qpair; + qpair_ctx->cb_fn = cb_fn; + qpair_ctx->thread = qpair->group->thread; + qpair_ctx->ctx = ctx; + + /* Check for outstanding I/O */ + if (!TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb = _spdk_nvmf_qpair_destroy; + qpair->state_cb_arg = qpair_ctx; + spdk_nvmf_qpair_free_aer(qpair); + return 0; + } + + _spdk_nvmf_qpair_destroy(qpair_ctx, 0); + + return 0; +} + +int +spdk_nvmf_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return spdk_nvmf_transport_qpair_get_listen_trid(qpair, trid); +} + +int +spdk_nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *tgroup; + + TAILQ_FOREACH(tgroup, &group->tgroups, link) { + if (tgroup->transport == transport) { + /* Transport already in the poll group */ + return 0; + } + } + + tgroup = spdk_nvmf_transport_poll_group_create(transport); + if (!tgroup) { + SPDK_ERRLOG("Unable to create poll group for transport\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&group->tgroups, tgroup, link); + + return 0; +} + +static int +poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + uint32_t new_num_channels, old_num_channels; + uint32_t i; + struct spdk_nvmf_ns *ns; + + /* Make sure our poll group has memory for this subsystem allocated */ + if (subsystem->id >= group->num_sgroups) { + return -ENOMEM; + } + + sgroup = &group->sgroups[subsystem->id]; + + /* Make sure the array of channels is the correct size */ + new_num_channels = subsystem->max_nsid; + old_num_channels = sgroup->num_channels; + + if (old_num_channels == 0) { + if (new_num_channels > 0) { + /* First allocation */ + sgroup->channels = calloc(new_num_channels, sizeof(sgroup->channels[0])); + if (!sgroup->channels) { + return -ENOMEM; + } + } + } else if (new_num_channels > old_num_channels) { + void *buf; + + /* Make the array larger */ + buf = realloc(sgroup->channels, new_num_channels * sizeof(sgroup->channels[0])); + if (!buf) { + return -ENOMEM; + } + + sgroup->channels = buf; + + /* Null out the new channels slots */ + for (i = old_num_channels; i < new_num_channels; i++) { + sgroup->channels[i] = NULL; + } + } else if (new_num_channels < old_num_channels) { + void *buf; + + /* Free the extra I/O channels */ + for (i = new_num_channels; i < old_num_channels; i++) { + if (sgroup->channels[i]) { + spdk_put_io_channel(sgroup->channels[i]); + sgroup->channels[i] = NULL; + } + } + + /* Make the array smaller */ + if (new_num_channels > 0) { + buf = realloc(sgroup->channels, new_num_channels * sizeof(sgroup->channels[0])); + if (!buf) { + return -ENOMEM; + } + sgroup->channels = buf; + } else { + free(sgroup->channels); + sgroup->channels = NULL; + } + } + + sgroup->num_channels = new_num_channels; + + /* Detect bdevs that were added or removed */ + for (i = 0; i < sgroup->num_channels; i++) { + ns = subsystem->ns[i]; + if (ns == NULL && sgroup->channels[i] == NULL) { + /* Both NULL. Leave empty */ + } else if (ns == NULL && sgroup->channels[i] != NULL) { + /* There was a channel here, but the namespace is gone. */ + spdk_put_io_channel(sgroup->channels[i]); + sgroup->channels[i] = NULL; + } else if (ns != NULL && sgroup->channels[i] == NULL) { + /* A namespace appeared but there is no channel yet */ + sgroup->channels[i] = spdk_bdev_get_io_channel(ns->desc); + if (sgroup->channels[i] == NULL) { + SPDK_ERRLOG("Could not allocate I/O channel.\n"); + return -ENOMEM; + } + } else { + /* A namespace was present before and didn't change. */ + } + } + + return 0; +} + +int +spdk_nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem) +{ + return poll_group_update_subsystem(group, subsystem); +} + +int +spdk_nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + int rc = 0; + struct spdk_nvmf_subsystem_poll_group *sgroup = &group->sgroups[subsystem->id]; + + TAILQ_INIT(&sgroup->queued); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + spdk_nvmf_poll_group_remove_subsystem(group, subsystem, NULL, NULL); + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } + + return rc; +} + +static void +_nvmf_poll_group_remove_subsystem_cb(void *ctx, int status) +{ + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem_poll_group *sgroup; + spdk_nvmf_poll_group_mod_done cpl_fn = NULL; + void *cpl_ctx = NULL; + uint32_t nsid; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + cpl_fn = qpair_ctx->cpl_fn; + cpl_ctx = qpair_ctx->cpl_ctx; + sgroup = &group->sgroups[subsystem->id]; + + if (status) { + goto fini; + } + + for (nsid = 0; nsid < sgroup->num_channels; nsid++) { + if (sgroup->channels[nsid]) { + spdk_put_io_channel(sgroup->channels[nsid]); + sgroup->channels[nsid] = NULL; + } + } + + sgroup->num_channels = 0; + free(sgroup->channels); + sgroup->channels = NULL; +fini: + free(qpair_ctx); + if (cpl_fn) { + cpl_fn(cpl_ctx, status); + } +} + +static void +_nvmf_subsystem_disconnect_next_qpair(void *ctx) +{ + struct spdk_nvmf_qpair *qpair; + struct nvmf_qpair_disconnect_many_ctx *qpair_ctx = ctx; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_poll_group *group; + int rc = 0; + + group = qpair_ctx->group; + subsystem = qpair_ctx->subsystem; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr->subsys == subsystem) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, qpair_ctx); + } + + if (!qpair || rc != 0) { + _nvmf_poll_group_remove_subsystem_cb(ctx, rc); + } + return; +} + +void +spdk_nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_subsystem_poll_group *sgroup; + struct nvmf_qpair_disconnect_many_ctx *ctx; + int rc = 0; + + ctx = calloc(1, sizeof(struct nvmf_qpair_disconnect_many_ctx)); + + if (!ctx) { + SPDK_ERRLOG("Unable to allocate memory for context to remove poll subsystem\n"); + goto fini; + } + + ctx->group = group; + ctx->subsystem = subsystem; + ctx->cpl_fn = cb_fn; + ctx->cpl_ctx = cb_arg; + + sgroup = &group->sgroups[subsystem->id]; + sgroup->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + + TAILQ_FOREACH(qpair, &group->qpairs, link) { + if (qpair->ctrlr->subsys == subsystem) { + break; + } + } + + if (qpair) { + rc = spdk_nvmf_qpair_disconnect(qpair, _nvmf_subsystem_disconnect_next_qpair, ctx); + } else { + /* call the callback immediately. It will handle any channel iteration */ + _nvmf_poll_group_remove_subsystem_cb(ctx, 0); + } + + if (rc != 0) { + free(ctx); + goto fini; + } + + return; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +spdk_nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + if (sgroup == NULL) { + rc = -1; + goto fini; + } + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_ACTIVE); + /* TODO: This currently does not quiesce I/O */ + sgroup->state = SPDK_NVMF_SUBSYSTEM_PAUSED; +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} + +void +spdk_nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg) +{ + struct spdk_nvmf_request *req, *tmp; + struct spdk_nvmf_subsystem_poll_group *sgroup; + int rc = 0; + + if (subsystem->id >= group->num_sgroups) { + rc = -1; + goto fini; + } + + sgroup = &group->sgroups[subsystem->id]; + + assert(sgroup->state == SPDK_NVMF_SUBSYSTEM_PAUSED); + + rc = poll_group_update_subsystem(group, subsystem); + if (rc) { + goto fini; + } + + sgroup->state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + + /* Release all queued requests */ + TAILQ_FOREACH_SAFE(req, &sgroup->queued, link, tmp) { + TAILQ_REMOVE(&sgroup->queued, req, link); + spdk_nvmf_request_exec(req); + } +fini: + if (cb_fn) { + cb_fn(cb_arg, rc); + } +} diff --git a/src/spdk/lib/nvmf/nvmf_fc.h b/src/spdk/lib/nvmf/nvmf_fc.h new file mode 100644 index 00000000..bf086831 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_fc.h @@ -0,0 +1,871 @@ +/* + * BSD LICENSE + * + * Copyright (c) 2018 Broadcom. All Rights Reserved. + * The term "Broadcom" refers to Broadcom Inc. and/or its subsidiaries. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVMF_FC_H__ +#define __NVMF_FC_H__ + +#include "spdk/nvmf.h" +#include "spdk/assert.h" +#include "spdk/nvme_spec.h" +#include "spdk/nvmf_fc_spec.h" +#include "spdk/event.h" +#include "spdk/io_channel.h" +#include "nvmf_internal.h" + +#define SPDK_NVMF_FC_TR_ADDR_LEN 64 + +/* + * FC HW port states. + */ +enum spdk_fc_port_state { + SPDK_FC_PORT_OFFLINE = 0, + SPDK_FC_PORT_ONLINE = 1, + SPDK_FC_PORT_QUIESCED = 2, +}; + +enum spdk_fc_hwqp_state { + SPDK_FC_HWQP_OFFLINE = 0, + SPDK_FC_HWQP_ONLINE = 1, +}; + +/* + * NVMF BCM FC Object state + * Add all the generic states of the object here. + * Specific object states can be added separately + */ +enum spdk_nvmf_fc_object_state { + SPDK_NVMF_FC_OBJECT_CREATED = 0, + SPDK_NVMF_FC_OBJECT_TO_BE_DELETED = 1, + SPDK_NVMF_FC_OBJECT_ZOMBIE = 2, /* Partial Create or Delete */ +}; + +/* + * FC request state + */ +enum spdk_nvmf_fc_request_state { + SPDK_NVMF_FC_REQ_INIT = 0, + SPDK_NVMF_FC_REQ_READ_BDEV, + SPDK_NVMF_FC_REQ_READ_XFER, + SPDK_NVMF_FC_REQ_READ_RSP, + SPDK_NVMF_FC_REQ_WRITE_BUFFS, + SPDK_NVMF_FC_REQ_WRITE_XFER, + SPDK_NVMF_FC_REQ_WRITE_BDEV, + SPDK_NVMF_FC_REQ_WRITE_RSP, + SPDK_NVMF_FC_REQ_NONE_BDEV, + SPDK_NVMF_FC_REQ_NONE_RSP, + SPDK_NVMF_FC_REQ_SUCCESS, + SPDK_NVMF_FC_REQ_FAILED, + SPDK_NVMF_FC_REQ_ABORTED, + SPDK_NVMF_FC_REQ_PENDING, + SPDK_NVMF_FC_REQ_MAX_STATE, +}; + +/* + * FC HWQP pointer + */ +typedef void *spdk_nvmf_fc_lld_hwqp_t; + +/* + * FC World Wide Name + */ +struct spdk_nvmf_fc_wwn { + union { + uint64_t wwn; /* World Wide Names consist of eight bytes */ + uint8_t octets[sizeof(uint64_t)]; + } u; +}; + +/* + * Generic DMA buffer descriptor + */ +struct spdk_nvmf_fc_buffer_desc { + void *virt; + uint64_t phys; + size_t len; + + /* Internal */ + uint32_t buf_index; +}; + +/* + * ABTS hadling context + */ +struct spdk_nvmf_fc_abts_ctx { + bool handled; + uint16_t hwqps_responded; + uint16_t rpi; + uint16_t oxid; + uint16_t rxid; + struct spdk_nvmf_fc_nport *nport; + uint16_t nport_hdl; + uint8_t port_hdl; + void *abts_poller_args; + void *sync_poller_args; + int num_hwqps; + bool queue_synced; + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *ls_hwqp; + uint16_t fcp_rq_id; +}; + +/* + * NVME FC transport errors + */ +struct spdk_nvmf_fc_errors { + uint32_t no_xri; + uint32_t nport_invalid; + uint32_t unknown_frame; + uint32_t wqe_cmplt_err; + uint32_t wqe_write_err; + uint32_t rq_status_err; + uint32_t rq_buf_len_err; + uint32_t rq_id_err; + uint32_t rq_index_err; + uint32_t invalid_cq_type; + uint32_t invalid_cq_id; + uint32_t fc_req_buf_err; + uint32_t aq_buf_alloc_err; + uint32_t write_buf_alloc_err; + uint32_t read_buf_alloc_err; + uint32_t unexpected_err; + uint32_t nvme_cmd_iu_err; + uint32_t nvme_cmd_xfer_err; + uint32_t queue_entry_invalid; + uint32_t invalid_conn_err; + uint32_t fcp_rsp_failure; + uint32_t write_failed; + uint32_t read_failed; + uint32_t rport_invalid; + uint32_t num_aborted; + uint32_t num_abts_sent; +}; + +/* + * Send Single Request/Response Sequence. + */ +struct spdk_nvmf_fc_send_srsr { + struct spdk_nvmf_fc_buffer_desc rqst; + struct spdk_nvmf_fc_buffer_desc rsp; + struct spdk_nvmf_fc_buffer_desc sgl; /* Note: Len = (2 * bcm_sge_t) */ + uint16_t rpi; +}; + +/* + * Struct representing a nport + */ +struct spdk_nvmf_fc_nport { + + uint16_t nport_hdl; + uint8_t port_hdl; + uint32_t d_id; + enum spdk_nvmf_fc_object_state nport_state; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + + /* list of remote ports (i.e. initiators) connected to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_remote_port_info) rem_port_list; + uint32_t rport_count; + + void *vendor_data; /* available for vendor use */ + + /* list of associations to nport */ + TAILQ_HEAD(, spdk_nvmf_fc_association) fc_associations; + uint32_t assoc_count; + struct spdk_nvmf_fc_port *fc_port; + TAILQ_ENTRY(spdk_nvmf_fc_nport) link; /* list of nports on a hw port. */ +}; + +/* + * NVMF FC Connection + */ +struct spdk_nvmf_fc_conn { + struct spdk_nvmf_qpair qpair; + + uint64_t conn_id; + struct spdk_nvmf_fc_hwqp *hwqp; + uint16_t esrp_ratio; + uint16_t rsp_count; + uint32_t rsn; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + uint16_t max_rw_depth; + /* The current number of I/O outstanding on this connection. This number + * includes all I/O from the time the capsule is first received until it is + * completed. + */ + uint16_t cur_queue_depth; + + /* number of read/write requests that are outstanding */ + uint16_t cur_fc_rw_depth; + + /* requests that are waiting to obtain xri/buffer */ + TAILQ_HEAD(, spdk_nvmf_fc_request) pending_queue; + + struct spdk_nvmf_fc_association *fc_assoc; + + /* additional FC info here - TBD */ + uint16_t rpi; + + /* for association's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_link; + + /* for assocations's available connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) assoc_avail_link; + + /* for hwqp's connection list */ + TAILQ_ENTRY(spdk_nvmf_fc_conn) link; +}; + +/* + * Structure for maintaining the XRI's + */ +struct spdk_nvmf_fc_xri { + uint32_t xri; /* The actual xri value */ + /* Internal */ + TAILQ_ENTRY(spdk_nvmf_fc_xri) link; + bool is_active; +}; + +struct spdk_nvmf_fc_poll_group; + +/* + * HWQP poller structure passed from Master thread + */ +struct spdk_nvmf_fc_hwqp { + uint32_t lcore_id; /* core hwqp is running on (for tracing purposes only) */ + struct spdk_thread *thread; /* thread hwqp is running on */ + uint32_t hwqp_id; /* A unique id (per physical port) for a hwqp */ + uint32_t rq_size; /* receive queue size */ + spdk_nvmf_fc_lld_hwqp_t queues; /* vendor HW queue set */ + struct spdk_nvmf_fc_port *fc_port; /* HW port structure for these queues */ + struct spdk_nvmf_fc_poll_group *poll_group; + + void *context; /* Vendor Context */ + + TAILQ_HEAD(, spdk_nvmf_fc_conn) connection_list; + uint32_t num_conns; /* number of connections to queue */ + uint16_t cid_cnt; /* used to generate unique conn. id for RQ */ + uint32_t free_q_slots; /* free q slots available for connections */ + enum spdk_fc_hwqp_state state; /* Poller state (e.g. online, offline) */ + + /* Internal */ + struct spdk_mempool *fc_request_pool; + TAILQ_HEAD(, spdk_nvmf_fc_request) in_use_reqs; + + TAILQ_HEAD(, spdk_nvmf_fc_xri) pending_xri_list; + + struct spdk_nvmf_fc_errors counters; + uint32_t send_frame_xri; + uint8_t send_frame_seqid; + + /* Pending LS request waiting for XRI. */ + TAILQ_HEAD(, spdk_nvmf_fc_ls_rqst) ls_pending_queue; + + /* Sync req list */ + TAILQ_HEAD(, spdk_nvmf_fc_poller_api_queue_sync_args) sync_cbs; + + TAILQ_ENTRY(spdk_nvmf_fc_hwqp) link; +}; + +struct spdk_nvmf_fc_ls_rsrc_pool { + void *assocs_mptr; + uint32_t assocs_count; + TAILQ_HEAD(, spdk_nvmf_fc_association) assoc_free_list; + + void *conns_mptr; + uint32_t conns_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conn_free_list; +}; + +/* + * FC HW port. + */ +struct spdk_nvmf_fc_port { + uint8_t port_hdl; + enum spdk_fc_port_state hw_port_status; + uint32_t xri_base; + uint32_t xri_count; + uint16_t fcp_rq_id; + struct spdk_ring *xri_ring; + struct spdk_nvmf_fc_hwqp ls_queue; + uint32_t num_io_queues; + struct spdk_nvmf_fc_hwqp *io_queues; + /* + * List of nports on this HW port. + */ + TAILQ_HEAD(, spdk_nvmf_fc_nport)nport_list; + int num_nports; + TAILQ_ENTRY(spdk_nvmf_fc_port) link; + + struct spdk_nvmf_fc_ls_rsrc_pool ls_rsrc_pool; + struct spdk_mempool *io_rsrc_pool; /* Pools to store bdev_io's for this port */ + void *port_ctx; +}; + +/* + * NVMF FC Request + */ +struct spdk_nvmf_fc_request { + struct spdk_nvmf_request req; + struct spdk_nvmf_fc_ersp_iu ersp; + uint32_t poller_lcore; /* for tracing purposes only */ + struct spdk_thread *poller_thread; + uint16_t buf_index; + struct spdk_nvmf_fc_xri *xri; + uint16_t oxid; + uint16_t rpi; + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + int state; + uint32_t transfered_len; + bool is_aborted; + uint32_t magic; + uint32_t s_id; + uint32_t d_id; + TAILQ_ENTRY(spdk_nvmf_fc_request) link; + TAILQ_ENTRY(spdk_nvmf_fc_request) pending_link; + TAILQ_HEAD(, spdk_nvmf_fc_caller_ctx) abort_cbs; +}; + +SPDK_STATIC_ASSERT(!offsetof(struct spdk_nvmf_fc_request, req), + "FC request and NVMF request address don't match."); + +/* + * NVMF FC Association + */ +struct spdk_nvmf_fc_association { + uint64_t assoc_id; + uint32_t s_id; + struct spdk_nvmf_fc_nport *tgtport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_subsystem *subsystem; + struct spdk_nvmf_host *host; + enum spdk_nvmf_fc_object_state assoc_state; + + char host_id[FCNVME_ASSOC_HOSTID_LEN]; + char host_nqn[FCNVME_ASSOC_HOSTNQN_LEN]; + char sub_nqn[FCNVME_ASSOC_HOSTNQN_LEN]; + + struct spdk_nvmf_fc_conn *aq_conn; /* connection for admin queue */ + + uint16_t conn_count; + TAILQ_HEAD(, spdk_nvmf_fc_conn) fc_conns; + + void *conns_buf; + TAILQ_HEAD(, spdk_nvmf_fc_conn) avail_fc_conns; + + TAILQ_ENTRY(spdk_nvmf_fc_association) link; + + /* for port's association free list */ + TAILQ_ENTRY(spdk_nvmf_fc_association) port_free_assoc_list_link; + + void *ls_del_op_ctx; /* delete assoc. callback list */ + + /* req/resp buffers used to send disconnect to initiator */ + struct spdk_nvmf_fc_send_srsr snd_disconn_bufs; +}; + +/* + * FC Remote Port + */ +struct spdk_nvmf_fc_remote_port_info { + uint32_t s_id; + uint32_t rpi; + uint32_t assoc_count; + struct spdk_nvmf_fc_wwn fc_nodename; + struct spdk_nvmf_fc_wwn fc_portname; + enum spdk_nvmf_fc_object_state rport_state; + TAILQ_ENTRY(spdk_nvmf_fc_remote_port_info) link; +}; + +/* + * Poller API error codes + */ +enum spdk_nvmf_fc_poller_api_ret { + SPDK_NVMF_FC_POLLER_API_SUCCESS = 0, + SPDK_NVMF_FC_POLLER_API_ERROR, + SPDK_NVMF_FC_POLLER_API_INVALID_ARG, + SPDK_NVMF_FC_POLLER_API_NO_CONN_ID, + SPDK_NVMF_FC_POLLER_API_DUP_CONN_ID, + SPDK_NVMF_FC_POLLER_API_OXID_NOT_FOUND, +}; + +/* + * Poller API definitions + */ +enum spdk_nvmf_fc_poller_api { + SPDK_NVMF_FC_POLLER_API_ADD_CONNECTION, + SPDK_NVMF_FC_POLLER_API_DEL_CONNECTION, + SPDK_NVMF_FC_POLLER_API_QUIESCE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ACTIVATE_QUEUE, + SPDK_NVMF_FC_POLLER_API_ABTS_RECEIVED, + SPDK_NVMF_FC_POLLER_API_ADAPTER_EVENT, + SPDK_NVMF_FC_POLLER_API_AEN, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC, + SPDK_NVMF_FC_POLLER_API_QUEUE_SYNC_DONE, +}; + +/* + * Poller API callback function proto + */ +typedef void (*spdk_nvmf_fc_poller_api_cb)(void *cb_data, enum spdk_nvmf_fc_poller_api_ret ret); + +/* + * Poller API callback data + */ +struct spdk_nvmf_fc_poller_api_cb_info { + spdk_nvmf_fc_poller_api_cb cb_func; + void *cb_data; + enum spdk_nvmf_fc_poller_api_ret ret; +}; + +/* + * Poller API structures + */ +struct spdk_nvmf_fc_poller_api_add_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_del_connection_args { + struct spdk_nvmf_fc_conn *fc_conn; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + bool send_abts; + /* internal */ + int fc_request_cnt; +}; + +struct spdk_nvmf_fc_poller_api_quiesce_queue_args { + void *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_activate_queue_args { + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_abts_recvd_args { + struct spdk_nvmf_fc_abts_ctx *ctx; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; +}; + +struct spdk_nvmf_fc_poller_api_queue_sync_done_args { + struct spdk_nvmf_fc_hwqp *hwqp; + uint64_t tag; +}; + +/* + * NVMF LS request structure + */ +struct spdk_nvmf_fc_ls_rqst { + struct spdk_nvmf_fc_buffer_desc rqstbuf; + struct spdk_nvmf_fc_buffer_desc rspbuf; + uint32_t rqst_len; + uint32_t rsp_len; + uint32_t rpi; + struct spdk_nvmf_fc_xri *xri; + uint16_t oxid; + void *private_data; /* for LLD only (LS does not touch) */ + TAILQ_ENTRY(spdk_nvmf_fc_ls_rqst) ls_pending_link; + uint32_t s_id; + uint32_t d_id; + struct spdk_nvmf_fc_nport *nport; + struct spdk_nvmf_fc_remote_port_info *rport; + struct spdk_nvmf_tgt *nvmf_tgt; +}; + +/* + * RQ Buffer LS Overlay Structure + */ +#define FCNVME_LS_RSVD_SIZE (FCNVME_MAX_LS_BUFFER_SIZE - \ + (sizeof(struct spdk_nvmf_fc_ls_rqst) + FCNVME_MAX_LS_REQ_SIZE + FCNVME_MAX_LS_RSP_SIZE)) + +struct __attribute__((__packed__)) spdk_nvmf_fc_rq_buf_ls_request { + uint8_t rqst[FCNVME_MAX_LS_REQ_SIZE]; + uint8_t resp[FCNVME_MAX_LS_RSP_SIZE]; + struct spdk_nvmf_fc_ls_rqst ls_rqst; + uint8_t rsvd[FCNVME_LS_RSVD_SIZE]; +}; + +SPDK_STATIC_ASSERT(sizeof(struct spdk_nvmf_fc_rq_buf_ls_request) == + FCNVME_MAX_LS_BUFFER_SIZE, "LS RQ Buffer overflow"); + + +struct spdk_nvmf_fc_poller_api_queue_sync_args { + uint64_t u_id; + struct spdk_nvmf_fc_hwqp *hwqp; + struct spdk_nvmf_fc_poller_api_cb_info cb_info; + + /* Used internally by poller */ + TAILQ_ENTRY(spdk_nvmf_fc_poller_api_queue_sync_args) link; +}; + +/* + * dump info + */ +struct spdk_nvmf_fc_queue_dump_info { + char *buffer; + int offset; +}; +#define SPDK_FC_HW_DUMP_BUF_SIZE (10 * 4096) + +static inline void +spdk_nvmf_fc_dump_buf_print(struct spdk_nvmf_fc_queue_dump_info *dump_info, char *fmt, ...) +{ + uint64_t buffer_size = SPDK_FC_HW_DUMP_BUF_SIZE; + int32_t avail = (int32_t)(buffer_size - dump_info->offset); + + if (avail > 0) { + va_list ap; + int32_t written; + + va_start(ap, fmt); + written = vsnprintf(dump_info->buffer + dump_info->offset, avail, fmt, ap); + if (written >= avail) { + dump_info->offset += avail; + } else { + dump_info->offset += written; + } + va_end(ap); + } +} + +/* + * NVMF FC caller callback definitions + */ +typedef void (*spdk_nvmf_fc_caller_cb)(void *hwqp, int32_t status, void *args); + +struct spdk_nvmf_fc_caller_ctx { + void *ctx; + spdk_nvmf_fc_caller_cb cb; + void *cb_args; + TAILQ_ENTRY(spdk_nvmf_fc_caller_ctx) link; +}; + +/* + * Low level FC driver function table (functions provided by vendor FC device driver) + */ +struct spdk_nvmf_fc_ll_drvr_ops { + + /* initialize the low level driver */ + int (*lld_init)(void); + + /* low level driver finish */ + void (*lld_fini)(void); + + /* initialize hw queues */ + int (*init_q)(struct spdk_nvmf_fc_hwqp *hwqp); + + void (*reinit_q)(spdk_nvmf_fc_lld_hwqp_t queues_prev, + spdk_nvmf_fc_lld_hwqp_t queues_curr); + + /* initialize hw queue buffers */ + int (*init_q_buffers)(struct spdk_nvmf_fc_hwqp *hwqp); + + /* poll the hw queues for requests */ + uint32_t (*poll_queue)(struct spdk_nvmf_fc_hwqp *hwqp); + + /* receive data (for data-in requests) */ + int (*recv_data)(struct spdk_nvmf_fc_request *fc_req); + + /* send data (for data-out requests) */ + int (*send_data)(struct spdk_nvmf_fc_request *fc_req); + + /* release hw queust buffer */ + void (*q_buffer_release)(struct spdk_nvmf_fc_hwqp *hwqp, uint16_t buff_idx); + + /* transmist nvme response */ + int (*xmt_rsp)(struct spdk_nvmf_fc_request *fc_req, uint8_t *ersp_buf, uint32_t ersp_len); + + /* transmist LS response */ + int (*xmt_ls_rsp)(struct spdk_nvmf_fc_nport *tgtport, struct spdk_nvmf_fc_ls_rqst *ls_rqst); + + /* issue abts */ + int (*issue_abort)(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_xri *xri, + bool send_abts, spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* transmit abts response */ + int (*xmt_bls_rsp)(struct spdk_nvmf_fc_hwqp *hwqp, uint16_t ox_id, uint16_t rx_id, uint16_t rpi, + bool rjt, uint8_t rjt_exp, spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* transmit single request - single response */ + int (*xmt_srsr_req)(struct spdk_nvmf_fc_hwqp *hwqp, struct spdk_nvmf_fc_send_srsr *srsr, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + + /* issue queue marker (abts processing) */ + int (*issue_q_marker)(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t u_id, uint16_t skip_rq); + + /* assign a new connection to a hwqp (return connection ID) */ + struct spdk_nvmf_fc_hwqp *(*assign_conn_to_hwqp)( + struct spdk_nvmf_fc_hwqp *queues, uint32_t num_queues, + uint64_t *conn_id, uint32_t sq_size, bool for_aq); + + /* get the hwqp from the given connection id */ + struct spdk_nvmf_fc_hwqp *(*get_hwqp_from_conn_id)(struct spdk_nvmf_fc_hwqp *hwqp, + uint32_t num_queues, uint64_t conn_id); + + /* release connection ID (done with using it) */ + void (*release_conn)(struct spdk_nvmf_fc_hwqp *hwqp, uint64_t conn_id, uint32_t sq_size); + + /* dump all queue info into dump_info */ + void (*dump_all_queues)(struct spdk_nvmf_fc_hwqp *ls_queues, + struct spdk_nvmf_fc_hwqp *io_queues, + uint32_t num_queues, + struct spdk_nvmf_fc_queue_dump_info *dump_info); +}; + +extern struct spdk_nvmf_fc_ll_drvr_ops spdk_nvmf_fc_lld_ops; + +/* + * NVMF FC inline and function prototypes + */ + +static inline struct spdk_nvmf_fc_request * +spdk_nvmf_fc_get_fc_req(struct spdk_nvmf_request *req) +{ + return (struct spdk_nvmf_fc_request *) + ((uintptr_t)req - offsetof(struct spdk_nvmf_fc_request, req)); +} + +static inline bool +spdk_nvmf_fc_is_port_dead(struct spdk_nvmf_fc_hwqp *hwqp) +{ + switch (hwqp->fc_port->hw_port_status) { + case SPDK_FC_PORT_QUIESCED: + return true; + default: + return false; + } +} + +static inline bool +spdk_nvmf_fc_req_in_xfer(struct spdk_nvmf_fc_request *fc_req) +{ + switch (fc_req->state) { + case SPDK_NVMF_FC_REQ_READ_XFER: + case SPDK_NVMF_FC_REQ_READ_RSP: + case SPDK_NVMF_FC_REQ_WRITE_XFER: + case SPDK_NVMF_FC_REQ_WRITE_RSP: + case SPDK_NVMF_FC_REQ_NONE_RSP: + return true; + default: + return false; + } +} + +typedef void (*spdk_nvmf_fc_del_assoc_cb)(void *arg, uint32_t err); +int spdk_nvmf_fc_delete_association(struct spdk_nvmf_fc_nport *tgtport, + uint64_t assoc_id, bool send_abts, + spdk_nvmf_fc_del_assoc_cb del_assoc_cb, + void *cb_data); + +void spdk_nvmf_fc_ls_init(struct spdk_nvmf_fc_port *fc_port); + +void spdk_nvmf_fc_ls_fini(struct spdk_nvmf_fc_port *fc_port); + +struct spdk_nvmf_fc_port *spdk_nvmf_fc_port_list_get(uint8_t port_hdl); + +int spdk_nvmf_fc_nport_set_state(struct spdk_nvmf_fc_nport *nport, + enum spdk_nvmf_fc_object_state state); + +int spdk_nvmf_fc_assoc_set_state(struct spdk_nvmf_fc_association *assoc, + enum spdk_nvmf_fc_object_state state); + +bool spdk_nvmf_fc_nport_add_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +bool spdk_nvmf_fc_nport_remove_rem_port(struct spdk_nvmf_fc_nport *nport, + struct spdk_nvmf_fc_remote_port_info *rem_port); + +void spdk_nvmf_fc_init_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_reinit_poller_queues(struct spdk_nvmf_fc_hwqp *hwqp, + void *queues_curr); + +void spdk_nvmf_fc_init_poller(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_add_hwqp_to_poller(struct spdk_nvmf_fc_hwqp *hwqp, bool admin_q); + +void spdk_nvmf_fc_remove_hwqp_from_poller(struct spdk_nvmf_fc_hwqp *hwqp); + +bool spdk_nvmf_fc_port_is_offline(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_port_set_offline(struct spdk_nvmf_fc_port *fc_port); + +bool spdk_nvmf_fc_port_is_online(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_port_set_online(struct spdk_nvmf_fc_port *fc_port); + +int spdk_nvmf_fc_hwqp_port_set_online(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_hwqp_port_set_offline(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_rport_set_state(struct spdk_nvmf_fc_remote_port_info *rport, + enum spdk_nvmf_fc_object_state state); + +void spdk_nvmf_fc_port_list_add(struct spdk_nvmf_fc_port *fc_port); + +struct spdk_nvmf_fc_nport *spdk_nvmf_fc_nport_get(uint8_t port_hdl, uint16_t nport_hdl); + +int spdk_nvmf_fc_port_add_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +uint32_t spdk_nvmf_fc_nport_get_association_count(struct spdk_nvmf_fc_nport *nport); + +int spdk_nvmf_fc_port_remove_nport(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_nport *nport); + +uint32_t spdk_nvmf_fc_get_prli_service_params(void); + +bool spdk_nvmf_fc_nport_is_rport_empty(struct spdk_nvmf_fc_nport *nport); + +void spdk_nvmf_fc_handle_abts_frame(struct spdk_nvmf_fc_nport *nport, + uint16_t rpi, uint16_t oxid, + uint16_t rxid); + +void spdk_nvmf_fc_dump_all_queues(struct spdk_nvmf_fc_port *fc_port, + struct spdk_nvmf_fc_queue_dump_info *dump_info); + +void spdk_nvmf_fc_handle_ls_rqst(struct spdk_nvmf_fc_ls_rqst *ls_rqst); + +int spdk_nvmf_fc_xmt_ls_rsp(struct spdk_nvmf_fc_nport *tgtport, + struct spdk_nvmf_fc_ls_rqst *ls_rqst); + +struct spdk_nvmf_fc_nport *spdk_nvmf_bcm_req_fc_nport_get(struct spdk_nvmf_request *req); + +struct spdk_nvmf_fc_association *spdk_nvmf_fc_get_ctrlr_assoc(struct spdk_nvmf_ctrlr *ctrlr); + +bool spdk_nvmf_fc_nport_is_association_empty(struct spdk_nvmf_fc_nport *nport); + +int spdk_nvmf_fc_xmt_srsr_req(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_send_srsr *srsr, + spdk_nvmf_fc_caller_cb cb, void *cb_args); + +uint32_t spdk_nvmf_fc_get_num_nport_ctrlrs_in_subsystem(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_subsystem *subsys); + +bool spdk_nvmf_fc_is_spdk_ctrlr_on_nport(uint8_t port_hdl, uint16_t nport_hdl, + struct spdk_nvmf_ctrlr *ctrlr); + +int spdk_nvmf_fc_get_ctrlr_init_traddr(char *traddr, struct spdk_nvmf_ctrlr *ctrlr); + +uint32_t spdk_nvmf_fc_get_hwqp_id(struct spdk_nvmf_request *req); + +void spdk_nvmf_fc_req_abort(struct spdk_nvmf_fc_request *fc_req, + bool send_abts, spdk_nvmf_fc_caller_cb cb, + void *cb_args); + +int spdk_nvmf_fc_add_port_listen(void *arg1, void *arg2); + +int spdk_nvmf_fc_remove_port_listen(void *arg1, void *arg2); + +void spdk_nvmf_fc_subsys_connect_cb(void *cb_ctx, + struct spdk_nvmf_request *req); + +void spdk_nvmf_fc_subsys_disconnect_cb(void *cb_ctx, + struct spdk_nvmf_qpair *qpair); + +uint32_t spdk_nvmf_fc_get_master_lcore(void); + +struct spdk_thread *spdk_nvmf_fc_get_master_thread(void); + +/* + * These functions are used by low level FC driver + */ + +static inline struct spdk_nvmf_fc_conn * +spdk_nvmf_fc_get_conn(struct spdk_nvmf_qpair *qpair) +{ + return (struct spdk_nvmf_fc_conn *) + ((uintptr_t)qpair - offsetof(struct spdk_nvmf_fc_conn, qpair)); +} + +static inline uint16_t +spdk_nvmf_fc_advance_conn_sqhead(struct spdk_nvmf_qpair *qpair) +{ + /* advance sq_head pointer - wrap if needed */ + qpair->sq_head = (qpair->sq_head == qpair->sq_head_max) ? + 0 : (qpair->sq_head + 1); + return qpair->sq_head; +} + +static inline bool +spdk_nvmf_fc_use_send_frame(struct spdk_nvmf_request *req) +{ + /* For now use for only keepalives. */ + if (req->qpair->qid == 0 && + (req->cmd->nvme_cmd.opc == SPDK_NVME_OPC_KEEP_ALIVE)) { + return true; + } + return false; +} + +enum spdk_nvmf_fc_poller_api_ret spdk_nvmf_fc_poller_api_func( + struct spdk_nvmf_fc_hwqp *hwqp, + enum spdk_nvmf_fc_poller_api api, + void *api_args); + +int spdk_nvmf_fc_process_frame(struct spdk_nvmf_fc_hwqp *hwqp, uint32_t buff_idx, + struct spdk_nvmf_fc_frame_hdr *frame, + struct spdk_nvmf_fc_buffer_desc *buffer, uint32_t plen); + +void spdk_nvmf_fc_process_pending_req(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_process_pending_ls_rqst(struct spdk_nvmf_fc_hwqp *hwqp); + +void spdk_nvmf_fc_req_set_state(struct spdk_nvmf_fc_request *fc_req, + enum spdk_nvmf_fc_request_state state); + +void spdk_nvmf_fc_free_req(struct spdk_nvmf_fc_request *fc_req); + +void spdk_nvmf_fc_req_abort_complete(void *arg1); + +bool spdk_nvmf_fc_send_ersp_required(struct spdk_nvmf_fc_request *fc_req, + uint32_t rsp_cnt, uint32_t xfer_len); + +struct spdk_nvmf_fc_xri *spdk_nvmf_fc_get_xri(struct spdk_nvmf_fc_hwqp *hwqp); + +int spdk_nvmf_fc_put_xri(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_xri *xri); + +void spdk_nvmf_fc_release_xri(struct spdk_nvmf_fc_hwqp *hwqp, + struct spdk_nvmf_fc_xri *xri, bool xb, bool abts); + +int spdk_nvmf_fc_handle_rsp(struct spdk_nvmf_fc_request *req); +#endif diff --git a/src/spdk/lib/nvmf/nvmf_internal.h b/src/spdk/lib/nvmf/nvmf_internal.h new file mode 100644 index 00000000..c9c7bf36 --- /dev/null +++ b/src/spdk/lib/nvmf/nvmf_internal.h @@ -0,0 +1,333 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __NVMF_INTERNAL_H__ +#define __NVMF_INTERNAL_H__ + +#include "spdk/stdinc.h" + +#include "spdk/likely.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/assert.h" +#include "spdk/bdev.h" +#include "spdk/queue.h" +#include "spdk/util.h" +#include "spdk/thread.h" + +#define SPDK_NVMF_MAX_SGL_ENTRIES 16 + +enum spdk_nvmf_subsystem_state { + SPDK_NVMF_SUBSYSTEM_INACTIVE = 0, + SPDK_NVMF_SUBSYSTEM_ACTIVATING, + SPDK_NVMF_SUBSYSTEM_ACTIVE, + SPDK_NVMF_SUBSYSTEM_PAUSING, + SPDK_NVMF_SUBSYSTEM_PAUSED, + SPDK_NVMF_SUBSYSTEM_RESUMING, + SPDK_NVMF_SUBSYSTEM_DEACTIVATING, +}; + +enum spdk_nvmf_qpair_state { + SPDK_NVMF_QPAIR_UNINITIALIZED = 0, + SPDK_NVMF_QPAIR_INACTIVE, + SPDK_NVMF_QPAIR_ACTIVATING, + SPDK_NVMF_QPAIR_ACTIVE, + SPDK_NVMF_QPAIR_DEACTIVATING, + SPDK_NVMF_QPAIR_ERROR, +}; + +typedef void (*spdk_nvmf_state_change_done)(void *cb_arg, int status); + +struct spdk_nvmf_tgt { + struct spdk_nvmf_tgt_opts opts; + + uint64_t discovery_genctr; + + /* Array of subsystem pointers of size max_subsystems indexed by sid */ + struct spdk_nvmf_subsystem **subsystems; + + struct spdk_nvmf_discovery_log_page *discovery_log_page; + size_t discovery_log_page_size; + TAILQ_HEAD(, spdk_nvmf_transport) transports; + + spdk_nvmf_tgt_destroy_done_fn *destroy_cb_fn; + void *destroy_cb_arg; +}; + +struct spdk_nvmf_host { + char *nqn; + TAILQ_ENTRY(spdk_nvmf_host) link; +}; + +struct spdk_nvmf_listener { + struct spdk_nvme_transport_id trid; + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_listener) link; +}; + +struct spdk_nvmf_transport_poll_group { + struct spdk_nvmf_transport *transport; + TAILQ_ENTRY(spdk_nvmf_transport_poll_group) link; +}; + +struct spdk_nvmf_subsystem_poll_group { + /* Array of channels for each namespace indexed by nsid - 1 */ + struct spdk_io_channel **channels; + uint32_t num_channels; + + enum spdk_nvmf_subsystem_state state; + + TAILQ_HEAD(, spdk_nvmf_request) queued; +}; + +struct spdk_nvmf_poll_group { + struct spdk_thread *thread; + struct spdk_poller *poller; + + TAILQ_HEAD(, spdk_nvmf_transport_poll_group) tgroups; + + /* Array of poll groups indexed by subsystem id (sid) */ + struct spdk_nvmf_subsystem_poll_group *sgroups; + uint32_t num_sgroups; + + /* All of the queue pairs that belong to this poll group */ + TAILQ_HEAD(, spdk_nvmf_qpair) qpairs; +}; + +typedef enum _spdk_nvmf_request_exec_status { + SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE, + SPDK_NVMF_REQUEST_EXEC_STATUS_ASYNCHRONOUS, +} spdk_nvmf_request_exec_status; + +union nvmf_h2c_msg { + struct spdk_nvmf_capsule_cmd nvmf_cmd; + struct spdk_nvme_cmd nvme_cmd; + struct spdk_nvmf_fabric_prop_set_cmd prop_set_cmd; + struct spdk_nvmf_fabric_prop_get_cmd prop_get_cmd; + struct spdk_nvmf_fabric_connect_cmd connect_cmd; +}; +SPDK_STATIC_ASSERT(sizeof(union nvmf_h2c_msg) == 64, "Incorrect size"); + +union nvmf_c2h_msg { + struct spdk_nvme_cpl nvme_cpl; + struct spdk_nvmf_fabric_prop_get_rsp prop_get_rsp; + struct spdk_nvmf_fabric_connect_rsp connect_rsp; +}; +SPDK_STATIC_ASSERT(sizeof(union nvmf_c2h_msg) == 16, "Incorrect size"); + +struct spdk_nvmf_request { + struct spdk_nvmf_qpair *qpair; + uint32_t length; + enum spdk_nvme_data_transfer xfer; + void *data; + union nvmf_h2c_msg *cmd; + union nvmf_c2h_msg *rsp; + struct iovec iov[SPDK_NVMF_MAX_SGL_ENTRIES]; + uint32_t iovcnt; + struct spdk_bdev_io_wait_entry bdev_io_wait; + + TAILQ_ENTRY(spdk_nvmf_request) link; +}; + +struct spdk_nvmf_ns { + struct spdk_nvmf_subsystem *subsystem; + struct spdk_bdev *bdev; + struct spdk_bdev_desc *desc; + struct spdk_nvmf_ns_opts opts; +}; + +struct spdk_nvmf_qpair { + enum spdk_nvmf_qpair_state state; + spdk_nvmf_state_change_done state_cb; + void *state_cb_arg; + + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_ctrlr *ctrlr; + struct spdk_nvmf_poll_group *group; + + uint16_t qid; + uint16_t sq_head; + uint16_t sq_head_max; + + TAILQ_HEAD(, spdk_nvmf_request) outstanding; + TAILQ_ENTRY(spdk_nvmf_qpair) link; +}; + +struct spdk_nvmf_ctrlr_feat { + union spdk_nvme_feat_arbitration arbitration; + union spdk_nvme_feat_power_management power_management; + union spdk_nvme_feat_error_recovery error_recovery; + union spdk_nvme_feat_volatile_write_cache volatile_write_cache; + union spdk_nvme_feat_number_of_queues number_of_queues; + union spdk_nvme_feat_write_atomicity write_atomicity; + union spdk_nvme_feat_async_event_configuration async_event_configuration; + union spdk_nvme_feat_keep_alive_timer keep_alive_timer; +}; + +/* + * This structure represents an NVMe-oF controller, + * which is like a "session" in networking terms. + */ +struct spdk_nvmf_ctrlr { + uint16_t cntlid; + struct spdk_nvmf_subsystem *subsys; + + struct { + union spdk_nvme_cap_register cap; + union spdk_nvme_vs_register vs; + union spdk_nvme_cc_register cc; + union spdk_nvme_csts_register csts; + } vcprop; /* virtual controller properties */ + + struct spdk_nvmf_ctrlr_feat feat; + + struct spdk_nvmf_qpair *admin_qpair; + struct spdk_thread *thread; + struct spdk_bit_array *qpair_mask; + + struct spdk_nvmf_request *aer_req; + union spdk_nvme_async_event_completion notice_event; + uint8_t hostid[16]; + + uint16_t changed_ns_list_count; + struct spdk_nvme_ns_list changed_ns_list; + + TAILQ_ENTRY(spdk_nvmf_ctrlr) link; +}; + +struct spdk_nvmf_subsystem { + struct spdk_thread *thread; + uint32_t id; + enum spdk_nvmf_subsystem_state state; + + char subnqn[SPDK_NVMF_NQN_MAX_LEN + 1]; + enum spdk_nvmf_subtype subtype; + uint16_t next_cntlid; + bool allow_any_host; + + struct spdk_nvmf_tgt *tgt; + + char sn[SPDK_NVME_CTRLR_SN_LEN + 1]; + + /* Array of pointers to namespaces of size max_nsid indexed by nsid - 1 */ + struct spdk_nvmf_ns **ns; + uint32_t max_nsid; + /* This is the maximum allowed nsid to a subsystem */ + uint32_t max_allowed_nsid; + + TAILQ_HEAD(, spdk_nvmf_ctrlr) ctrlrs; + + TAILQ_HEAD(, spdk_nvmf_host) hosts; + + TAILQ_HEAD(, spdk_nvmf_listener) listeners; + + TAILQ_ENTRY(spdk_nvmf_subsystem) entries; +}; + +typedef void(*spdk_nvmf_poll_group_mod_done)(void *cb_arg, int status); + +struct spdk_nvmf_transport *spdk_nvmf_tgt_get_transport(struct spdk_nvmf_tgt *tgt, + enum spdk_nvme_transport_type); + +int spdk_nvmf_poll_group_add_transport(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_transport *transport); +int spdk_nvmf_poll_group_update_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem); +int spdk_nvmf_poll_group_add_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_remove_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_pause_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_poll_group_resume_subsystem(struct spdk_nvmf_poll_group *group, + struct spdk_nvmf_subsystem *subsystem, spdk_nvmf_poll_group_mod_done cb_fn, void *cb_arg); +void spdk_nvmf_request_exec(struct spdk_nvmf_request *req); +int spdk_nvmf_request_free(struct spdk_nvmf_request *req); +int spdk_nvmf_request_complete(struct spdk_nvmf_request *req); + +void spdk_nvmf_get_discovery_log_page(struct spdk_nvmf_tgt *tgt, + void *buffer, uint64_t offset, + uint32_t length); + +void spdk_nvmf_ctrlr_destruct(struct spdk_nvmf_ctrlr *ctrlr); +int spdk_nvmf_ctrlr_process_fabrics_cmd(struct spdk_nvmf_request *req); +int spdk_nvmf_ctrlr_process_admin_cmd(struct spdk_nvmf_request *req); +int spdk_nvmf_ctrlr_process_io_cmd(struct spdk_nvmf_request *req); +bool spdk_nvmf_ctrlr_dsm_supported(struct spdk_nvmf_ctrlr *ctrlr); +bool spdk_nvmf_ctrlr_write_zeroes_supported(struct spdk_nvmf_ctrlr *ctrlr); +void spdk_nvmf_ctrlr_ns_changed(struct spdk_nvmf_ctrlr *ctrlr, uint32_t nsid); + +void spdk_nvmf_bdev_ctrlr_identify_ns(struct spdk_nvmf_ns *ns, struct spdk_nvme_ns_data *nsdata); + +int spdk_nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +void spdk_nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr); +struct spdk_nvmf_ctrlr *spdk_nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, + uint16_t cntlid); +int spdk_nvmf_ctrlr_async_event_ns_notice(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Abort aer is sent on a per controller basis and sends a completion for the aer to the host. + * This function should be called when attempting to recover in error paths when it is OK for + * the host to send a subsequent AER. + */ +void spdk_nvmf_ctrlr_abort_aer(struct spdk_nvmf_ctrlr *ctrlr); + +/* + * Free aer simply frees the rdma resources for the aer without informing the host. + * This function should be called when deleting a qpair when one wants to make sure + * the qpair is completely empty before freeing the request. The reason we free the + * AER without sending a completion is to prevent the host from sending another AER. + */ +void spdk_nvmf_qpair_free_aer(struct spdk_nvmf_qpair *qpair); + +static inline struct spdk_nvmf_ns * +_spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + /* NOTE: This implicitly also checks for 0, since 0 - 1 wraps around to UINT32_MAX. */ + if (spdk_unlikely(nsid - 1 >= subsystem->max_nsid)) { + return NULL; + } + + return subsystem->ns[nsid - 1]; +} + +static inline bool +spdk_nvmf_qpair_is_admin_queue(struct spdk_nvmf_qpair *qpair) +{ + return qpair->qid == 0; +} + +#endif /* __NVMF_INTERNAL_H__ */ diff --git a/src/spdk/lib/nvmf/rdma.c b/src/spdk/lib/nvmf/rdma.c new file mode 100644 index 00000000..333e703f --- /dev/null +++ b/src/spdk/lib/nvmf/rdma.c @@ -0,0 +1,2930 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include <infiniband/verbs.h> +#include <rdma/rdma_cma.h> +#include <rdma/rdma_verbs.h> + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/assert.h" +#include "spdk/thread.h" +#include "spdk/nvmf.h" +#include "spdk/nvmf_spec.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/util.h" + +#include "spdk_internal/log.h" + +/* + RDMA Connection Resource Defaults + */ +#define NVMF_DEFAULT_TX_SGE 1 +#define NVMF_DEFAULT_RX_SGE 2 +#define NVMF_DEFAULT_DATA_SGE 16 + +/* The RDMA completion queue size */ +#define NVMF_RDMA_CQ_SIZE 4096 + +/* AIO backend requires block size aligned data buffers, + * extra 4KiB aligned data buffer should work for most devices. + */ +#define SHIFT_4KB 12 +#define NVMF_DATA_BUFFER_ALIGNMENT (1 << SHIFT_4KB) +#define NVMF_DATA_BUFFER_MASK (NVMF_DATA_BUFFER_ALIGNMENT - 1) + +enum spdk_nvmf_rdma_request_state { + /* The request is not currently in use */ + RDMA_REQUEST_STATE_FREE = 0, + + /* Initial state when request first received */ + RDMA_REQUEST_STATE_NEW, + + /* The request is queued until a data buffer is available. */ + RDMA_REQUEST_STATE_NEED_BUFFER, + + /* The request is waiting on RDMA queue depth availability + * to transfer data between the host and the controller. + */ + RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, + + /* The request is currently transferring data from the host to the controller. */ + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + + /* The request is ready to execute at the block device */ + RDMA_REQUEST_STATE_READY_TO_EXECUTE, + + /* The request is currently executing at the block device */ + RDMA_REQUEST_STATE_EXECUTING, + + /* The request finished executing at the block device */ + RDMA_REQUEST_STATE_EXECUTED, + + /* The request is ready to send a completion */ + RDMA_REQUEST_STATE_READY_TO_COMPLETE, + + /* The request is currently transferring data from the controller to the host. */ + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + + /* The request currently has an outstanding completion without an + * associated data transfer. + */ + RDMA_REQUEST_STATE_COMPLETING, + + /* The request completed and can be marked free. */ + RDMA_REQUEST_STATE_COMPLETED, + + /* Terminator */ + RDMA_REQUEST_NUM_STATES, +}; + +#define OBJECT_NVMF_RDMA_IO 0x40 + +#define TRACE_GROUP_NVMF_RDMA 0x4 +#define TRACE_RDMA_REQUEST_STATE_NEW SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x0) +#define TRACE_RDMA_REQUEST_STATE_NEED_BUFFER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x1) +#define TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x2) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x3) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x4) +#define TRACE_RDMA_REQUEST_STATE_EXECUTING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x5) +#define TRACE_RDMA_REQUEST_STATE_EXECUTED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x6) +#define TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x7) +#define TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x8) +#define TRACE_RDMA_REQUEST_STATE_COMPLETING SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x9) +#define TRACE_RDMA_REQUEST_STATE_COMPLETED SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xA) +#define TRACE_RDMA_QP_CREATE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xB) +#define TRACE_RDMA_IBV_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xC) +#define TRACE_RDMA_CM_ASYNC_EVENT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xD) +#define TRACE_RDMA_QP_STATE_CHANGE SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xE) +#define TRACE_RDMA_QP_DISCONNECT SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0xF) +#define TRACE_RDMA_QP_DESTROY SPDK_TPOINT_ID(TRACE_GROUP_NVMF_RDMA, 0x10) + +SPDK_TRACE_REGISTER_FN(nvmf_trace) +{ + spdk_trace_register_object(OBJECT_NVMF_RDMA_IO, 'r'); + spdk_trace_register_description("RDMA_REQ_NEW", "", + TRACE_RDMA_REQUEST_STATE_NEW, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 1, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_NEED_BUFFER", "", + TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_PENDING_H_TO_C", "", + TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_TX_H_TO_C", "", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_EXECUTE", "", + TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTING", "", + TRACE_RDMA_REQUEST_STATE_EXECUTING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_EXECUTED", "", + TRACE_RDMA_REQUEST_STATE_EXECUTED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_RDY_TO_COMPLETE", "", + TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_CONTROLLER_TO_HOST", "", + TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETING_INCAPSULE", "", + TRACE_RDMA_REQUEST_STATE_COMPLETING, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + spdk_trace_register_description("RDMA_REQ_COMPLETED", "", + TRACE_RDMA_REQUEST_STATE_COMPLETED, + OWNER_NONE, OBJECT_NVMF_RDMA_IO, 0, 1, "cmid: "); + + spdk_trace_register_description("RDMA_QP_CREATE", "", TRACE_RDMA_QP_CREATE, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_IBV_ASYNC_EVENT", "", TRACE_RDMA_IBV_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_CM_ASYNC_EVENT", "", TRACE_RDMA_CM_ASYNC_EVENT, + OWNER_NONE, OBJECT_NONE, 0, 0, "type: "); + spdk_trace_register_description("RDMA_QP_STATE_CHANGE", "", TRACE_RDMA_QP_STATE_CHANGE, + OWNER_NONE, OBJECT_NONE, 0, 1, "state: "); + spdk_trace_register_description("RDMA_QP_DISCONNECT", "", TRACE_RDMA_QP_DISCONNECT, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); + spdk_trace_register_description("RDMA_QP_DESTROY", "", TRACE_RDMA_QP_DESTROY, + OWNER_NONE, OBJECT_NONE, 0, 0, ""); +} + +/* This structure holds commands as they are received off the wire. + * It must be dynamically paired with a full request object + * (spdk_nvmf_rdma_request) to service a request. It is separate + * from the request because RDMA does not appear to order + * completions, so occasionally we'll get a new incoming + * command when there aren't any free request objects. + */ +struct spdk_nvmf_rdma_recv { + struct ibv_recv_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_RX_SGE]; + + struct spdk_nvmf_rdma_qpair *qpair; + + /* In-capsule data buffer */ + uint8_t *buf; + + TAILQ_ENTRY(spdk_nvmf_rdma_recv) link; +}; + +struct spdk_nvmf_rdma_request { + struct spdk_nvmf_request req; + bool data_from_pool; + + enum spdk_nvmf_rdma_request_state state; + + struct spdk_nvmf_rdma_recv *recv; + + struct { + struct ibv_send_wr wr; + struct ibv_sge sgl[NVMF_DEFAULT_TX_SGE]; + } rsp; + + struct { + struct ibv_send_wr wr; + struct ibv_sge sgl[SPDK_NVMF_MAX_SGL_ENTRIES]; + void *buffers[SPDK_NVMF_MAX_SGL_ENTRIES]; + } data; + + TAILQ_ENTRY(spdk_nvmf_rdma_request) link; + TAILQ_ENTRY(spdk_nvmf_rdma_request) state_link; +}; + +struct spdk_nvmf_rdma_qpair { + struct spdk_nvmf_qpair qpair; + + struct spdk_nvmf_rdma_port *port; + struct spdk_nvmf_rdma_poller *poller; + + struct rdma_cm_id *cm_id; + struct rdma_cm_id *listen_id; + + /* The maximum number of I/O outstanding on this connection at one time */ + uint16_t max_queue_depth; + + /* The maximum number of active RDMA READ and WRITE operations at one time */ + uint16_t max_rw_depth; + + /* Receives that are waiting for a request object */ + TAILQ_HEAD(, spdk_nvmf_rdma_recv) incoming_queue; + + /* Queues to track the requests in all states */ + TAILQ_HEAD(, spdk_nvmf_rdma_request) state_queue[RDMA_REQUEST_NUM_STATES]; + + /* Number of requests in each state */ + uint32_t state_cntr[RDMA_REQUEST_NUM_STATES]; + + int max_sge; + + /* Array of size "max_queue_depth" containing RDMA requests. */ + struct spdk_nvmf_rdma_request *reqs; + + /* Array of size "max_queue_depth" containing RDMA recvs. */ + struct spdk_nvmf_rdma_recv *recvs; + + /* Array of size "max_queue_depth" containing 64 byte capsules + * used for receive. + */ + union nvmf_h2c_msg *cmds; + struct ibv_mr *cmds_mr; + + /* Array of size "max_queue_depth" containing 16 byte completions + * to be sent back to the user. + */ + union nvmf_c2h_msg *cpls; + struct ibv_mr *cpls_mr; + + /* Array of size "max_queue_depth * InCapsuleDataSize" containing + * buffers to be used for in capsule data. + */ + void *bufs; + struct ibv_mr *bufs_mr; + + TAILQ_ENTRY(spdk_nvmf_rdma_qpair) link; + + /* Mgmt channel */ + struct spdk_io_channel *mgmt_channel; + struct spdk_nvmf_rdma_mgmt_channel *ch; + + /* IBV queue pair attributes: they are used to manage + * qp state and recover from errors. + */ + struct ibv_qp_init_attr ibv_init_attr; + struct ibv_qp_attr ibv_attr; + + bool qpair_disconnected; + + /* Reference counter for how many unprocessed messages + * from other threads are currently outstanding. The + * qpair cannot be destroyed until this is 0. This is + * atomically incremented from any thread, but only + * decremented and read from the thread that owns this + * qpair. + */ + uint32_t refcnt; +}; + +struct spdk_nvmf_rdma_poller { + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poll_group *group; + + struct ibv_cq *cq; + + TAILQ_HEAD(, spdk_nvmf_rdma_qpair) qpairs; + + TAILQ_ENTRY(spdk_nvmf_rdma_poller) link; +}; + +struct spdk_nvmf_rdma_poll_group { + struct spdk_nvmf_transport_poll_group group; + + TAILQ_HEAD(, spdk_nvmf_rdma_poller) pollers; +}; + +/* Assuming rdma_cm uses just one protection domain per ibv_context. */ +struct spdk_nvmf_rdma_device { + struct ibv_device_attr attr; + struct ibv_context *context; + + struct spdk_mem_map *map; + struct ibv_pd *pd; + + TAILQ_ENTRY(spdk_nvmf_rdma_device) link; +}; + +struct spdk_nvmf_rdma_port { + struct spdk_nvme_transport_id trid; + struct rdma_cm_id *id; + struct spdk_nvmf_rdma_device *device; + uint32_t ref; + TAILQ_ENTRY(spdk_nvmf_rdma_port) link; +}; + +struct spdk_nvmf_rdma_transport { + struct spdk_nvmf_transport transport; + + struct rdma_event_channel *event_channel; + + struct spdk_mempool *data_buf_pool; + + pthread_mutex_t lock; + + /* fields used to poll RDMA/IB events */ + nfds_t npoll_fds; + struct pollfd *poll_fds; + + TAILQ_HEAD(, spdk_nvmf_rdma_device) devices; + TAILQ_HEAD(, spdk_nvmf_rdma_port) ports; +}; + +struct spdk_nvmf_rdma_mgmt_channel { + /* Requests that are waiting to obtain a data buffer */ + TAILQ_HEAD(, spdk_nvmf_rdma_request) pending_data_buf_queue; +}; + +static inline void +spdk_nvmf_rdma_qpair_inc_refcnt(struct spdk_nvmf_rdma_qpair *rqpair) +{ + __sync_fetch_and_add(&rqpair->refcnt, 1); +} + +static inline uint32_t +spdk_nvmf_rdma_qpair_dec_refcnt(struct spdk_nvmf_rdma_qpair *rqpair) +{ + uint32_t old_refcnt, new_refcnt; + + do { + old_refcnt = rqpair->refcnt; + assert(old_refcnt > 0); + new_refcnt = old_refcnt - 1; + } while (__sync_bool_compare_and_swap(&rqpair->refcnt, old_refcnt, new_refcnt) == false); + + return new_refcnt; +} + +/* API to IBV QueuePair */ +static const char *str_ibv_qp_state[] = { + "IBV_QPS_RESET", + "IBV_QPS_INIT", + "IBV_QPS_RTR", + "IBV_QPS_RTS", + "IBV_QPS_SQD", + "IBV_QPS_SQE", + "IBV_QPS_ERR" +}; + +static enum ibv_qp_state +spdk_nvmf_rdma_update_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair) { + enum ibv_qp_state old_state, new_state; + int rc; + + /* All the attributes needed for recovery */ + static int spdk_nvmf_ibv_attr_mask = + IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC; + + old_state = rqpair->ibv_attr.qp_state; + rc = ibv_query_qp(rqpair->cm_id->qp, &rqpair->ibv_attr, + spdk_nvmf_ibv_attr_mask, &rqpair->ibv_init_attr); + + if (rc) + { + SPDK_ERRLOG("Failed to get updated RDMA queue pair state!\n"); + assert(false); + } + + new_state = rqpair->ibv_attr.qp_state; + if (old_state != new_state) + { + spdk_trace_record(TRACE_RDMA_QP_STATE_CHANGE, 0, 0, + (uintptr_t)rqpair->cm_id, new_state); + } + return new_state; +} + +static int +spdk_nvmf_rdma_set_ibv_state(struct spdk_nvmf_rdma_qpair *rqpair, + enum ibv_qp_state new_state) +{ + int rc; + enum ibv_qp_state state; + static int attr_mask_rc[] = { + [IBV_QPS_RESET] = IBV_QP_STATE, + [IBV_QPS_INIT] = (IBV_QP_STATE | + IBV_QP_PKEY_INDEX | + IBV_QP_PORT | + IBV_QP_ACCESS_FLAGS), + [IBV_QPS_RTR] = (IBV_QP_STATE | + IBV_QP_AV | + IBV_QP_PATH_MTU | + IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | + IBV_QP_MAX_DEST_RD_ATOMIC | + IBV_QP_MIN_RNR_TIMER), + [IBV_QPS_RTS] = (IBV_QP_STATE | + IBV_QP_SQ_PSN | + IBV_QP_TIMEOUT | + IBV_QP_RETRY_CNT | + IBV_QP_RNR_RETRY | + IBV_QP_MAX_QP_RD_ATOMIC), + [IBV_QPS_SQD] = IBV_QP_STATE, + [IBV_QPS_SQE] = IBV_QP_STATE, + [IBV_QPS_ERR] = IBV_QP_STATE, + }; + + switch (new_state) { + case IBV_QPS_RESET: + case IBV_QPS_INIT: + case IBV_QPS_RTR: + case IBV_QPS_RTS: + case IBV_QPS_SQD: + case IBV_QPS_SQE: + case IBV_QPS_ERR: + break; + default: + SPDK_ERRLOG("QP#%d: bad state requested: %u\n", + rqpair->qpair.qid, new_state); + return -1; + } + rqpair->ibv_attr.cur_qp_state = rqpair->ibv_attr.qp_state; + rqpair->ibv_attr.qp_state = new_state; + rqpair->ibv_attr.ah_attr.port_num = rqpair->ibv_attr.port_num; + + rc = ibv_modify_qp(rqpair->cm_id->qp, &rqpair->ibv_attr, + attr_mask_rc[new_state]); + + if (rc) { + SPDK_ERRLOG("QP#%d: failed to set state to: %s, %d (%s)\n", + rqpair->qpair.qid, str_ibv_qp_state[new_state], errno, strerror(errno)); + return rc; + } + + state = spdk_nvmf_rdma_update_ibv_state(rqpair); + + if (state != new_state) { + SPDK_ERRLOG("QP#%d: expected state: %s, actual state: %s\n", + rqpair->qpair.qid, str_ibv_qp_state[new_state], + str_ibv_qp_state[state]); + return -1; + } + SPDK_NOTICELOG("IBV QP#%u changed to: %s\n", rqpair->qpair.qid, + str_ibv_qp_state[state]); + return 0; +} + +static void +spdk_nvmf_rdma_request_set_state(struct spdk_nvmf_rdma_request *rdma_req, + enum spdk_nvmf_rdma_request_state state) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + qpair = rdma_req->req.qpair; + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + TAILQ_REMOVE(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]--; + + rdma_req->state = state; + + TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]++; +} + +static int +spdk_nvmf_rdma_mgmt_channel_create(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; + + TAILQ_INIT(&ch->pending_data_buf_queue); + return 0; +} + +static void +spdk_nvmf_rdma_mgmt_channel_destroy(void *io_device, void *ctx_buf) +{ + struct spdk_nvmf_rdma_mgmt_channel *ch = ctx_buf; + + if (!TAILQ_EMPTY(&ch->pending_data_buf_queue)) { + SPDK_ERRLOG("Pending I/O list wasn't empty on channel destruction\n"); + } +} + +static int +spdk_nvmf_rdma_cur_rw_depth(struct spdk_nvmf_rdma_qpair *rqpair) +{ + return rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER] + + rqpair->state_cntr[RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST]; +} + +static int +spdk_nvmf_rdma_cur_queue_depth(struct spdk_nvmf_rdma_qpair *rqpair) +{ + return rqpair->max_queue_depth - + rqpair->state_cntr[RDMA_REQUEST_STATE_FREE]; +} + +static void +spdk_nvmf_rdma_qpair_destroy(struct spdk_nvmf_rdma_qpair *rqpair) +{ + spdk_trace_record(TRACE_RDMA_QP_DESTROY, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + if (spdk_nvmf_rdma_cur_queue_depth(rqpair)) { + rqpair->qpair_disconnected = true; + return; + } + + if (rqpair->refcnt > 0) { + return; + } + + if (rqpair->poller) { + TAILQ_REMOVE(&rqpair->poller->qpairs, rqpair, link); + } + + if (rqpair->cmds_mr) { + ibv_dereg_mr(rqpair->cmds_mr); + } + + if (rqpair->cpls_mr) { + ibv_dereg_mr(rqpair->cpls_mr); + } + + if (rqpair->bufs_mr) { + ibv_dereg_mr(rqpair->bufs_mr); + } + + if (rqpair->cm_id) { + rdma_destroy_qp(rqpair->cm_id); + rdma_destroy_id(rqpair->cm_id); + } + + if (rqpair->mgmt_channel) { + spdk_put_io_channel(rqpair->mgmt_channel); + } + + /* Free all memory */ + spdk_dma_free(rqpair->cmds); + spdk_dma_free(rqpair->cpls); + spdk_dma_free(rqpair->bufs); + free(rqpair->reqs); + free(rqpair->recvs); + free(rqpair); +} + +static int +spdk_nvmf_rdma_qpair_initialize(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair; + int rc, i; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_transport *transport; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + transport = &rtransport->transport; + + memset(&rqpair->ibv_init_attr, 0, sizeof(struct ibv_qp_init_attr)); + rqpair->ibv_init_attr.qp_context = rqpair; + rqpair->ibv_init_attr.qp_type = IBV_QPT_RC; + rqpair->ibv_init_attr.send_cq = rqpair->poller->cq; + rqpair->ibv_init_attr.recv_cq = rqpair->poller->cq; + rqpair->ibv_init_attr.cap.max_send_wr = rqpair->max_queue_depth * + 2; /* SEND, READ, and WRITE operations */ + rqpair->ibv_init_attr.cap.max_recv_wr = rqpair->max_queue_depth; /* RECV operations */ + rqpair->ibv_init_attr.cap.max_send_sge = rqpair->max_sge; + rqpair->ibv_init_attr.cap.max_recv_sge = NVMF_DEFAULT_RX_SGE; + + rc = rdma_create_qp(rqpair->cm_id, rqpair->port->device->pd, &rqpair->ibv_init_attr); + if (rc) { + SPDK_ERRLOG("rdma_create_qp failed: errno %d: %s\n", errno, spdk_strerror(errno)); + rdma_destroy_id(rqpair->cm_id); + rqpair->cm_id = NULL; + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + spdk_trace_record(TRACE_RDMA_QP_CREATE, 0, 0, (uintptr_t)rqpair->cm_id, 0); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "New RDMA Connection: %p\n", qpair); + + rqpair->reqs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->reqs)); + rqpair->recvs = calloc(rqpair->max_queue_depth, sizeof(*rqpair->recvs)); + rqpair->cmds = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cmds), + 0x1000, NULL); + rqpair->cpls = spdk_dma_zmalloc(rqpair->max_queue_depth * sizeof(*rqpair->cpls), + 0x1000, NULL); + + + if (transport->opts.in_capsule_data_size > 0) { + rqpair->bufs = spdk_dma_zmalloc(rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, + 0x1000, NULL); + } + + if (!rqpair->reqs || !rqpair->recvs || !rqpair->cmds || + !rqpair->cpls || (transport->opts.in_capsule_data_size && !rqpair->bufs)) { + SPDK_ERRLOG("Unable to allocate sufficient memory for RDMA queue.\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + rqpair->cmds_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cmds, + rqpair->max_queue_depth * sizeof(*rqpair->cmds), + IBV_ACCESS_LOCAL_WRITE); + rqpair->cpls_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->cpls, + rqpair->max_queue_depth * sizeof(*rqpair->cpls), + 0); + + if (transport->opts.in_capsule_data_size) { + rqpair->bufs_mr = ibv_reg_mr(rqpair->cm_id->pd, rqpair->bufs, + rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); + } + + if (!rqpair->cmds_mr || !rqpair->cpls_mr || (transport->opts.in_capsule_data_size && + !rqpair->bufs_mr)) { + SPDK_ERRLOG("Unable to register required memory for RDMA queue.\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Command Array: %p Length: %lx LKey: %x\n", + rqpair->cmds, rqpair->max_queue_depth * sizeof(*rqpair->cmds), rqpair->cmds_mr->lkey); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Completion Array: %p Length: %lx LKey: %x\n", + rqpair->cpls, rqpair->max_queue_depth * sizeof(*rqpair->cpls), rqpair->cpls_mr->lkey); + if (rqpair->bufs && rqpair->bufs_mr) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "In Capsule Data Array: %p Length: %x LKey: %x\n", + rqpair->bufs, rqpair->max_queue_depth * + transport->opts.in_capsule_data_size, rqpair->bufs_mr->lkey); + } + + /* Initialise request state queues and counters of the queue pair */ + for (i = RDMA_REQUEST_STATE_FREE; i < RDMA_REQUEST_NUM_STATES; i++) { + TAILQ_INIT(&rqpair->state_queue[i]); + rqpair->state_cntr[i] = 0; + } + + for (i = 0; i < rqpair->max_queue_depth; i++) { + struct ibv_recv_wr *bad_wr = NULL; + + rdma_recv = &rqpair->recvs[i]; + rdma_recv->qpair = rqpair; + + /* Set up memory to receive commands */ + if (rqpair->bufs) { + rdma_recv->buf = (void *)((uintptr_t)rqpair->bufs + (i * + transport->opts.in_capsule_data_size)); + } + + rdma_recv->sgl[0].addr = (uintptr_t)&rqpair->cmds[i]; + rdma_recv->sgl[0].length = sizeof(rqpair->cmds[i]); + rdma_recv->sgl[0].lkey = rqpair->cmds_mr->lkey; + rdma_recv->wr.num_sge = 1; + + if (rdma_recv->buf && rqpair->bufs_mr) { + rdma_recv->sgl[1].addr = (uintptr_t)rdma_recv->buf; + rdma_recv->sgl[1].length = transport->opts.in_capsule_data_size; + rdma_recv->sgl[1].lkey = rqpair->bufs_mr->lkey; + rdma_recv->wr.num_sge++; + } + + rdma_recv->wr.wr_id = (uintptr_t)rdma_recv; + rdma_recv->wr.sg_list = rdma_recv->sgl; + + rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_recv->wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Unable to post capsule for RDMA RECV\n"); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + } + + for (i = 0; i < rqpair->max_queue_depth; i++) { + rdma_req = &rqpair->reqs[i]; + + rdma_req->req.qpair = &rqpair->qpair; + rdma_req->req.cmd = NULL; + + /* Set up memory to send responses */ + rdma_req->req.rsp = &rqpair->cpls[i]; + + rdma_req->rsp.sgl[0].addr = (uintptr_t)&rqpair->cpls[i]; + rdma_req->rsp.sgl[0].length = sizeof(rqpair->cpls[i]); + rdma_req->rsp.sgl[0].lkey = rqpair->cpls_mr->lkey; + + rdma_req->rsp.wr.wr_id = (uintptr_t)rdma_req; + rdma_req->rsp.wr.next = NULL; + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->rsp.wr.sg_list = rdma_req->rsp.sgl; + rdma_req->rsp.wr.num_sge = SPDK_COUNTOF(rdma_req->rsp.sgl); + + /* Set up memory for data buffers */ + rdma_req->data.wr.wr_id = (uint64_t)rdma_req; + rdma_req->data.wr.next = NULL; + rdma_req->data.wr.send_flags = IBV_SEND_SIGNALED; + rdma_req->data.wr.sg_list = rdma_req->data.sgl; + rdma_req->data.wr.num_sge = SPDK_COUNTOF(rdma_req->data.sgl); + + /* Initialize request state to FREE */ + rdma_req->state = RDMA_REQUEST_STATE_FREE; + TAILQ_INSERT_TAIL(&rqpair->state_queue[rdma_req->state], rdma_req, state_link); + rqpair->state_cntr[rdma_req->state]++; + } + + return 0; +} + +static int +request_transfer_in(struct spdk_nvmf_request *req) +{ + int rc; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_send_wr *bad_wr = NULL; + + qpair = req->qpair; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(req->xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER); + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA READ POSTED. Request: %p Connection: %p\n", req, qpair); + + rdma_req->data.wr.opcode = IBV_WR_RDMA_READ; + rdma_req->data.wr.next = NULL; + rc = ibv_post_send(rqpair->cm_id->qp, &rdma_req->data.wr, &bad_wr); + if (rc) { + SPDK_ERRLOG("Unable to transfer data from host to target\n"); + return -1; + } + return 0; +} + +static int +request_transfer_out(struct spdk_nvmf_request *req, int *data_posted) +{ + int rc; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvme_cpl *rsp; + struct ibv_recv_wr *bad_recv_wr = NULL; + struct ibv_send_wr *send_wr, *bad_send_wr = NULL; + + *data_posted = 0; + qpair = req->qpair; + rsp = &req->rsp->nvme_cpl; + rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Advance our sq_head pointer */ + if (qpair->sq_head == qpair->sq_head_max) { + qpair->sq_head = 0; + } else { + qpair->sq_head++; + } + rsp->sqhd = qpair->sq_head; + + /* Post the capsule to the recv buffer */ + assert(rdma_req->recv != NULL); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA RECV POSTED. Recv: %p Connection: %p\n", rdma_req->recv, + rqpair); + rc = ibv_post_recv(rqpair->cm_id->qp, &rdma_req->recv->wr, &bad_recv_wr); + if (rc) { + SPDK_ERRLOG("Unable to re-post rx descriptor\n"); + return rc; + } + rdma_req->recv = NULL; + + /* Build the response which consists of an optional + * RDMA WRITE to transfer data, plus an RDMA SEND + * containing the response. + */ + send_wr = &rdma_req->rsp.wr; + + if (rsp->status.sc == SPDK_NVME_SC_SUCCESS && + req->xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA WRITE POSTED. Request: %p Connection: %p\n", req, qpair); + + rdma_req->data.wr.opcode = IBV_WR_RDMA_WRITE; + + rdma_req->data.wr.next = send_wr; + *data_posted = 1; + send_wr = &rdma_req->data.wr; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "RDMA SEND POSTED. Request: %p Connection: %p\n", req, qpair); + + /* Send the completion */ + rc = ibv_post_send(rqpair->cm_id->qp, send_wr, &bad_send_wr); + if (rc) { + SPDK_ERRLOG("Unable to send response capsule\n"); + } + + return rc; +} + +static int +spdk_nvmf_rdma_event_accept(struct rdma_cm_id *id, struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_accept_private_data accept_data; + struct rdma_conn_param ctrlr_event_data = {}; + int rc; + + accept_data.recfmt = 0; + accept_data.crqsize = rqpair->max_queue_depth; + + ctrlr_event_data.private_data = &accept_data; + ctrlr_event_data.private_data_len = sizeof(accept_data); + if (id->ps == RDMA_PS_TCP) { + ctrlr_event_data.responder_resources = 0; /* We accept 0 reads from the host */ + ctrlr_event_data.initiator_depth = rqpair->max_rw_depth; + } + + rc = rdma_accept(id, &ctrlr_event_data); + if (rc) { + SPDK_ERRLOG("Error %d on rdma_accept\n", errno); + } else { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Sent back the accept\n"); + } + + return rc; +} + +static void +spdk_nvmf_rdma_event_reject(struct rdma_cm_id *id, enum spdk_nvmf_rdma_transport_error error) +{ + struct spdk_nvmf_rdma_reject_private_data rej_data; + + rej_data.recfmt = 0; + rej_data.sts = error; + + rdma_reject(id, &rej_data, sizeof(rej_data)); +} + +static int +nvmf_rdma_connect(struct spdk_nvmf_transport *transport, struct rdma_cm_event *event, + new_qpair_fn cb_fn) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_qpair *rqpair = NULL; + struct spdk_nvmf_rdma_port *port; + struct rdma_conn_param *rdma_param = NULL; + const struct spdk_nvmf_rdma_request_private_data *private_data = NULL; + uint16_t max_queue_depth; + uint16_t max_rw_depth; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + assert(event->id != NULL); /* Impossible. Can't even reject the connection. */ + assert(event->id->verbs != NULL); /* Impossible. No way to handle this. */ + + rdma_param = &event->param.conn; + if (rdma_param->private_data == NULL || + rdma_param->private_data_len < sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_ERRLOG("connect request: no private data provided\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_PRIVATE_DATA_LENGTH); + return -1; + } + + private_data = rdma_param->private_data; + if (private_data->recfmt != 0) { + SPDK_ERRLOG("Received RDMA private data with RECFMT != 0\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_INVALID_RECFMT); + return -1; + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Connect Recv on fabric intf name %s, dev_name %s\n", + event->id->verbs->device->name, event->id->verbs->device->dev_name); + + port = event->listen_id->context; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Listen Id was %p with verbs %p. ListenAddr: %p\n", + event->listen_id, event->listen_id->verbs, port); + + /* Figure out the supported queue depth. This is a multi-step process + * that takes into account hardware maximums, host provided values, + * and our target's internal memory limits */ + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Calculating Queue Depth\n"); + + /* Start with the maximum queue depth allowed by the target */ + max_queue_depth = rtransport->transport.opts.max_queue_depth; + max_rw_depth = rtransport->transport.opts.max_queue_depth; + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Target Max Queue Depth: %d\n", + rtransport->transport.opts.max_queue_depth); + + /* Next check the local NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Local NIC Max Send/Recv Queue Depth: %d Max Read/Write Queue Depth: %d\n", + port->device->attr.max_qp_wr, port->device->attr.max_qp_rd_atom); + max_queue_depth = spdk_min(max_queue_depth, port->device->attr.max_qp_wr); + max_rw_depth = spdk_min(max_rw_depth, port->device->attr.max_qp_rd_atom); + + /* Next check the remote NIC's hardware limitations */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, + "Host (Initiator) NIC Max Incoming RDMA R/W operations: %d Max Outgoing RDMA R/W operations: %d\n", + rdma_param->initiator_depth, rdma_param->responder_resources); + if (rdma_param->initiator_depth > 0) { + max_rw_depth = spdk_min(max_rw_depth, rdma_param->initiator_depth); + } + + /* Finally check for the host software requested values, which are + * optional. */ + if (rdma_param->private_data != NULL && + rdma_param->private_data_len >= sizeof(struct spdk_nvmf_rdma_request_private_data)) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Receive Queue Size: %d\n", private_data->hrqsize); + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Host Send Queue Size: %d\n", private_data->hsqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hrqsize); + max_queue_depth = spdk_min(max_queue_depth, private_data->hsqsize + 1); + } + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Final Negotiated Queue Depth: %d R/W Depth: %d\n", + max_queue_depth, max_rw_depth); + + rqpair = calloc(1, sizeof(struct spdk_nvmf_rdma_qpair)); + if (rqpair == NULL) { + SPDK_ERRLOG("Could not allocate new connection.\n"); + spdk_nvmf_rdma_event_reject(event->id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + return -1; + } + + rqpair->port = port; + rqpair->max_queue_depth = max_queue_depth; + rqpair->max_rw_depth = max_rw_depth; + rqpair->cm_id = event->id; + rqpair->listen_id = event->listen_id; + rqpair->qpair.transport = transport; + rqpair->max_sge = spdk_min(port->device->attr.max_sge, SPDK_NVMF_MAX_SGL_ENTRIES); + TAILQ_INIT(&rqpair->incoming_queue); + event->id->context = &rqpair->qpair; + + cb_fn(&rqpair->qpair); + + return 0; +} + +static void +_nvmf_rdma_disconnect(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + spdk_nvmf_qpair_disconnect(qpair, NULL, NULL); +} + +static void +_nvmf_rdma_disconnect_retry(void *ctx) +{ + struct spdk_nvmf_qpair *qpair = ctx; + struct spdk_nvmf_poll_group *group; + + /* Read the group out of the qpair. This is normally set and accessed only from + * the thread that created the group. Here, we're not on that thread necessarily. + * The data member qpair->group begins it's life as NULL and then is assigned to + * a pointer and never changes. So fortunately reading this and checking for + * non-NULL is thread safe in the x86_64 memory model. */ + group = qpair->group; + + if (group == NULL) { + /* The qpair hasn't been assigned to a group yet, so we can't + * process a disconnect. Send a message to ourself and try again. */ + spdk_thread_send_msg(spdk_get_thread(), _nvmf_rdma_disconnect_retry, qpair); + return; + } + + spdk_thread_send_msg(group->thread, _nvmf_rdma_disconnect, qpair); +} + +static int +nvmf_rdma_disconnect(struct rdma_cm_event *evt) +{ + struct spdk_nvmf_qpair *qpair; + struct spdk_nvmf_rdma_qpair *rqpair; + + if (evt->id == NULL) { + SPDK_ERRLOG("disconnect request: missing cm_id\n"); + return -1; + } + + qpair = evt->id->context; + if (qpair == NULL) { + SPDK_ERRLOG("disconnect request: no active connection\n"); + return -1; + } + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_trace_record(TRACE_RDMA_QP_DISCONNECT, 0, 0, (uintptr_t)rqpair->cm_id, 0); + + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + + _nvmf_rdma_disconnect_retry(qpair); + + return 0; +} + +#ifdef DEBUG +static const char *CM_EVENT_STR[] = { + "RDMA_CM_EVENT_ADDR_RESOLVED", + "RDMA_CM_EVENT_ADDR_ERROR", + "RDMA_CM_EVENT_ROUTE_RESOLVED", + "RDMA_CM_EVENT_ROUTE_ERROR", + "RDMA_CM_EVENT_CONNECT_REQUEST", + "RDMA_CM_EVENT_CONNECT_RESPONSE", + "RDMA_CM_EVENT_CONNECT_ERROR", + "RDMA_CM_EVENT_UNREACHABLE", + "RDMA_CM_EVENT_REJECTED", + "RDMA_CM_EVENT_ESTABLISHED", + "RDMA_CM_EVENT_DISCONNECTED", + "RDMA_CM_EVENT_DEVICE_REMOVAL", + "RDMA_CM_EVENT_MULTICAST_JOIN", + "RDMA_CM_EVENT_MULTICAST_ERROR", + "RDMA_CM_EVENT_ADDR_CHANGE", + "RDMA_CM_EVENT_TIMEWAIT_EXIT" +}; +#endif /* DEBUG */ + +static void +spdk_nvmf_process_cm_event(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct rdma_cm_event *event; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + if (rtransport->event_channel == NULL) { + return; + } + + while (1) { + rc = rdma_get_cm_event(rtransport->event_channel, &event); + if (rc == 0) { + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Acceptor Event: %s\n", CM_EVENT_STR[event->event]); + + spdk_trace_record(TRACE_RDMA_CM_ASYNC_EVENT, 0, 0, 0, event->event); + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + case RDMA_CM_EVENT_ROUTE_ERROR: + /* No action required. The target never attempts to resolve routes. */ + break; + case RDMA_CM_EVENT_CONNECT_REQUEST: + rc = nvmf_rdma_connect(transport, event, cb_fn); + if (rc < 0) { + SPDK_ERRLOG("Unable to process connect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_CONNECT_RESPONSE: + /* The target never initiates a new connection. So this will not occur. */ + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + /* Can this happen? The docs say it can, but not sure what causes it. */ + break; + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_REJECTED: + /* These only occur on the client side. */ + break; + case RDMA_CM_EVENT_ESTABLISHED: + /* TODO: Should we be waiting for this event anywhere? */ + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + rc = nvmf_rdma_disconnect(event); + if (rc < 0) { + SPDK_ERRLOG("Unable to process disconnect event. rc: %d\n", rc); + break; + } + break; + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + /* Multicast is not used */ + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + /* Not utilizing this event */ + break; + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* For now, do nothing. The target never re-uses queue pairs. */ + break; + default: + SPDK_ERRLOG("Unexpected Acceptor Event [%d]\n", event->event); + break; + } + + rdma_ack_cm_event(event); + } else { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + SPDK_ERRLOG("Acceptor Event Error: %s\n", spdk_strerror(errno)); + } + break; + } + } +} + +static int +spdk_nvmf_rdma_mem_notify(void *cb_ctx, struct spdk_mem_map *map, + enum spdk_mem_map_notify_action action, + void *vaddr, size_t size) +{ + struct spdk_nvmf_rdma_device *device = cb_ctx; + struct ibv_pd *pd = device->pd; + struct ibv_mr *mr; + + switch (action) { + case SPDK_MEM_MAP_NOTIFY_REGISTER: + mr = ibv_reg_mr(pd, vaddr, size, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_READ | + IBV_ACCESS_REMOTE_WRITE); + if (mr == NULL) { + SPDK_ERRLOG("ibv_reg_mr() failed\n"); + return -1; + } else { + spdk_mem_map_set_translation(map, (uint64_t)vaddr, size, (uint64_t)mr); + } + break; + case SPDK_MEM_MAP_NOTIFY_UNREGISTER: + mr = (struct ibv_mr *)spdk_mem_map_translate(map, (uint64_t)vaddr, NULL); + spdk_mem_map_clear_translation(map, (uint64_t)vaddr, size); + if (mr) { + ibv_dereg_mr(mr); + } + break; + } + + return 0; +} + +typedef enum spdk_nvme_data_transfer spdk_nvme_data_transfer_t; + +static spdk_nvme_data_transfer_t +spdk_nvmf_rdma_request_get_xfer(struct spdk_nvmf_rdma_request *rdma_req) +{ + enum spdk_nvme_data_transfer xfer; + struct spdk_nvme_cmd *cmd = &rdma_req->req.cmd->nvme_cmd; + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + rdma_req->rsp.wr.opcode = IBV_WR_SEND; + rdma_req->rsp.wr.imm_data = 0; +#endif + + /* Figure out data transfer direction */ + if (cmd->opc == SPDK_NVME_OPC_FABRIC) { + xfer = spdk_nvme_opc_get_data_transfer(rdma_req->req.cmd->nvmf_cmd.fctype); + } else { + xfer = spdk_nvme_opc_get_data_transfer(cmd->opc); + + /* Some admin commands are special cases */ + if ((rdma_req->req.qpair->qid == 0) && + ((cmd->opc == SPDK_NVME_OPC_GET_FEATURES) || + (cmd->opc == SPDK_NVME_OPC_SET_FEATURES))) { + switch (cmd->cdw10 & 0xff) { + case SPDK_NVME_FEAT_LBA_RANGE_TYPE: + case SPDK_NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION: + case SPDK_NVME_FEAT_HOST_IDENTIFIER: + break; + default: + xfer = SPDK_NVME_DATA_NONE; + } + } + } + + if (xfer == SPDK_NVME_DATA_NONE) { + return xfer; + } + + /* Even for commands that may transfer data, they could have specified 0 length. + * We want those to show up with xfer SPDK_NVME_DATA_NONE. + */ + switch (sgl->generic.type) { + case SPDK_NVME_SGL_TYPE_DATA_BLOCK: + case SPDK_NVME_SGL_TYPE_BIT_BUCKET: + case SPDK_NVME_SGL_TYPE_SEGMENT: + case SPDK_NVME_SGL_TYPE_LAST_SEGMENT: + case SPDK_NVME_SGL_TYPE_TRANSPORT_DATA_BLOCK: + if (sgl->unkeyed.length == 0) { + xfer = SPDK_NVME_DATA_NONE; + } + break; + case SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK: + if (sgl->keyed.length == 0) { + xfer = SPDK_NVME_DATA_NONE; + } + break; + } + + return xfer; +} + +static int +spdk_nvmf_rdma_request_fill_iovs(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + void *buf = NULL; + uint32_t length = rdma_req->req.length; + uint32_t i = 0; + + rdma_req->req.iovcnt = 0; + while (length) { + buf = spdk_mempool_get(rtransport->data_buf_pool); + if (!buf) { + goto nomem; + } + + rdma_req->req.iov[i].iov_base = (void *)((uintptr_t)(buf + NVMF_DATA_BUFFER_MASK) & + ~NVMF_DATA_BUFFER_MASK); + rdma_req->req.iov[i].iov_len = spdk_min(length, rtransport->transport.opts.io_unit_size); + rdma_req->req.iovcnt++; + rdma_req->data.buffers[i] = buf; + rdma_req->data.wr.sg_list[i].addr = (uintptr_t)(rdma_req->req.iov[i].iov_base); + rdma_req->data.wr.sg_list[i].length = rdma_req->req.iov[i].iov_len; + rdma_req->data.wr.sg_list[i].lkey = ((struct ibv_mr *)spdk_mem_map_translate(device->map, + (uint64_t)buf, NULL))->lkey; + + length -= rdma_req->req.iov[i].iov_len; + i++; + } + + rdma_req->data_from_pool = true; + + return 0; + +nomem: + while (i) { + i--; + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->req.iov[i].iov_base); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->req.iov[i].iov_len = 0; + + rdma_req->data.wr.sg_list[i].addr = 0; + rdma_req->data.wr.sg_list[i].length = 0; + rdma_req->data.wr.sg_list[i].lkey = 0; + } + rdma_req->req.iovcnt = 0; + return -ENOMEM; +} + +static int +spdk_nvmf_rdma_request_parse_sgl(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_device *device, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvme_cmd *cmd; + struct spdk_nvme_cpl *rsp; + struct spdk_nvme_sgl_descriptor *sgl; + + cmd = &rdma_req->req.cmd->nvme_cmd; + rsp = &rdma_req->req.rsp->nvme_cpl; + sgl = &cmd->dptr.sgl1; + + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK && + (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_ADDRESS || + sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY)) { + if (sgl->keyed.length > rtransport->transport.opts.max_io_size) { + SPDK_ERRLOG("SGL length 0x%x exceeds max io size 0x%x\n", + sgl->keyed.length, rtransport->transport.opts.max_io_size); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) != 0) { + if (sgl->keyed.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY) { + rdma_req->rsp.wr.opcode = IBV_WR_SEND_WITH_INV; + rdma_req->rsp.wr.imm_data = sgl->keyed.key; + } + } +#endif + + /* fill request length and populate iovs */ + rdma_req->req.length = sgl->keyed.length; + + if (spdk_nvmf_rdma_request_fill_iovs(rtransport, device, rdma_req) < 0) { + /* No available buffers. Queue this request up. */ + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "No available large data buffers. Queueing request %p\n", rdma_req); + return 0; + } + + /* backward compatible */ + rdma_req->req.data = rdma_req->req.iov[0].iov_base; + + /* rdma wr specifics */ + rdma_req->data.wr.num_sge = rdma_req->req.iovcnt; + rdma_req->data.wr.wr.rdma.rkey = sgl->keyed.key; + rdma_req->data.wr.wr.rdma.remote_addr = sgl->address; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p took %d buffer/s from central pool\n", rdma_req, + rdma_req->req.iovcnt); + + return 0; + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK && + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET) { + uint64_t offset = sgl->address; + uint32_t max_len = rtransport->transport.opts.in_capsule_data_size; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "In-capsule data: offset 0x%" PRIx64 ", length 0x%x\n", + offset, sgl->unkeyed.length); + + if (offset > max_len) { + SPDK_ERRLOG("In-capsule offset 0x%" PRIx64 " exceeds capsule length 0x%x\n", + offset, max_len); + rsp->status.sc = SPDK_NVME_SC_INVALID_SGL_OFFSET; + return -1; + } + max_len -= (uint32_t)offset; + + if (sgl->unkeyed.length > max_len) { + SPDK_ERRLOG("In-capsule data length 0x%x exceeds capsule length 0x%x\n", + sgl->unkeyed.length, max_len); + rsp->status.sc = SPDK_NVME_SC_DATA_SGL_LENGTH_INVALID; + return -1; + } + + rdma_req->req.data = rdma_req->recv->buf + offset; + rdma_req->data_from_pool = false; + rdma_req->req.length = sgl->unkeyed.length; + + rdma_req->req.iov[0].iov_base = rdma_req->req.data; + rdma_req->req.iov[0].iov_len = rdma_req->req.length; + rdma_req->req.iovcnt = 1; + + return 0; + } + + SPDK_ERRLOG("Invalid NVMf I/O Command SGL: Type 0x%x, Subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + rsp->status.sc = SPDK_NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID; + return -1; +} + +static bool +spdk_nvmf_rdma_request_process(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_request *rdma_req) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvme_cpl *rsp = &rdma_req->req.rsp->nvme_cpl; + int rc; + struct spdk_nvmf_rdma_recv *rdma_recv; + enum spdk_nvmf_rdma_request_state prev_state; + bool progress = false; + int data_posted; + int cur_rdma_rw_depth; + + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + device = rqpair->port->device; + + assert(rdma_req->state != RDMA_REQUEST_STATE_FREE); + + /* If the queue pair is in an error state, force the request to the completed state + * to release resources. */ + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR || rqpair->qpair.state != SPDK_NVMF_QPAIR_ACTIVE) { + if (rdma_req->state == RDMA_REQUEST_STATE_NEED_BUFFER) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + } + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } + + /* The loop here is to allow for several back-to-back state changes. */ + do { + prev_state = rdma_req->state; + + SPDK_DEBUGLOG(SPDK_LOG_RDMA, "Request %p entering state %d\n", rdma_req, prev_state); + + switch (rdma_req->state) { + case RDMA_REQUEST_STATE_FREE: + /* Some external code must kick a request into RDMA_REQUEST_STATE_NEW + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_NEW: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEW, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rdma_recv = rdma_req->recv; + + /* The first element of the SGL is the NVMe command */ + rdma_req->req.cmd = (union nvmf_h2c_msg *)rdma_recv->sgl[0].addr; + memset(rdma_req->req.rsp, 0, sizeof(*rdma_req->req.rsp)); + + TAILQ_REMOVE(&rqpair->incoming_queue, rdma_recv, link); + + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + break; + } + + /* The next state transition depends on the data transfer needs of this request. */ + rdma_req->req.xfer = spdk_nvmf_rdma_request_get_xfer(rdma_req); + + /* If no data to transfer, ready to execute. */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_NONE) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + break; + } + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEED_BUFFER); + TAILQ_INSERT_TAIL(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + break; + case RDMA_REQUEST_STATE_NEED_BUFFER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_NEED_BUFFER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + assert(rdma_req->req.xfer != SPDK_NVME_DATA_NONE); + + if (rdma_req != TAILQ_FIRST(&rqpair->ch->pending_data_buf_queue)) { + /* This request needs to wait in line to obtain a buffer */ + break; + } + + /* Try to get a data buffer */ + rc = spdk_nvmf_rdma_request_parse_sgl(rtransport, device, rdma_req); + if (rc < 0) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + break; + } + + if (!rdma_req->req.data) { + /* No buffers available. */ + break; + } + + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + + /* If data is transferring from host to controller and the data didn't + * arrive using in capsule data, we need to do a transfer from the host. + */ + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER && rdma_req->data_from_pool) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + break; + } + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + break; + case RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req != TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING])) { + /* This request needs to wait in line to perform RDMA */ + break; + } + cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair); + + if (cur_rdma_rw_depth >= rqpair->max_rw_depth) { + /* R/W queue is full, need to wait */ + break; + } + + if (rdma_req->req.xfer == SPDK_NVME_DATA_HOST_TO_CONTROLLER) { + rc = request_transfer_in(&rdma_req->req); + if (!rc) { + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + } else { + rsp->status.sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } + } else if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + /* The data transfer will be kicked off from + * RDMA_REQUEST_STATE_READY_TO_COMPLETE state. + */ + spdk_nvmf_rdma_request_set_state(rdma_req, + RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } else { + SPDK_ERRLOG("Cannot perform data transfer, unknown state: %u\n", + rdma_req->req.xfer); + assert(0); + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_READY_TO_EXECUTE + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_READY_TO_EXECUTE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_EXECUTE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTING); + spdk_nvmf_request_exec(&rdma_req->req); + break; + case RDMA_REQUEST_STATE_EXECUTING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_EXECUTED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_EXECUTED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_EXECUTED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + if (rdma_req->req.xfer == SPDK_NVME_DATA_CONTROLLER_TO_HOST) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + } else { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + } + break; + case RDMA_REQUEST_STATE_READY_TO_COMPLETE: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_READY_TO_COMPLETE, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + rc = request_transfer_out(&rdma_req->req, &data_posted); + assert(rc == 0); /* No good way to handle this currently */ + if (rc) { + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } else { + spdk_nvmf_rdma_request_set_state(rdma_req, + data_posted ? + RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST : + RDMA_REQUEST_STATE_COMPLETING); + } + break; + case RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_COMPLETING: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETING, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + /* Some external code must kick a request into RDMA_REQUEST_STATE_COMPLETED + * to escape this state. */ + break; + case RDMA_REQUEST_STATE_COMPLETED: + spdk_trace_record(TRACE_RDMA_REQUEST_STATE_COMPLETED, 0, 0, + (uintptr_t)rdma_req, (uintptr_t)rqpair->cm_id); + + if (rdma_req->data_from_pool) { + /* Put the buffer/s back in the pool */ + for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->data.buffers[i] = NULL; + } + rdma_req->data_from_pool = false; + } + rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE); + break; + case RDMA_REQUEST_NUM_STATES: + default: + assert(0); + break; + } + + if (rdma_req->state != prev_state) { + progress = true; + } + } while (rdma_req->state != prev_state); + + return progress; +} + +/* Public API callbacks begin here */ + +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH 128 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR 64 +#define SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE 4096 +#define SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE 131072 +#define SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE 131072 + +static void +spdk_nvmf_rdma_opts_init(struct spdk_nvmf_transport_opts *opts) +{ + opts->max_queue_depth = SPDK_NVMF_RDMA_DEFAULT_MAX_QUEUE_DEPTH; + opts->max_qpairs_per_ctrlr = SPDK_NVMF_RDMA_DEFAULT_MAX_QPAIRS_PER_CTRLR; + opts->in_capsule_data_size = SPDK_NVMF_RDMA_DEFAULT_IN_CAPSULE_DATA_SIZE; + opts->max_io_size = SPDK_NVMF_RDMA_DEFAULT_MAX_IO_SIZE; + opts->io_unit_size = SPDK_NVMF_RDMA_DEFAULT_IO_BUFFER_SIZE; + opts->max_aq_depth = SPDK_NVMF_RDMA_DEFAULT_AQ_DEPTH; +} + +static int spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport); + +static struct spdk_nvmf_transport * +spdk_nvmf_rdma_create(struct spdk_nvmf_transport_opts *opts) +{ + int rc; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + struct ibv_context **contexts; + uint32_t i; + int flag; + uint32_t sge_count; + + const struct spdk_mem_map_ops nvmf_rdma_map_ops = { + .notify_cb = spdk_nvmf_rdma_mem_notify, + .are_contiguous = NULL + }; + + rtransport = calloc(1, sizeof(*rtransport)); + if (!rtransport) { + return NULL; + } + + if (pthread_mutex_init(&rtransport->lock, NULL)) { + SPDK_ERRLOG("pthread_mutex_init() failed\n"); + free(rtransport); + return NULL; + } + + spdk_io_device_register(rtransport, spdk_nvmf_rdma_mgmt_channel_create, + spdk_nvmf_rdma_mgmt_channel_destroy, + sizeof(struct spdk_nvmf_rdma_mgmt_channel), + "rdma_transport"); + + TAILQ_INIT(&rtransport->devices); + TAILQ_INIT(&rtransport->ports); + + rtransport->transport.ops = &spdk_nvmf_transport_rdma; + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** RDMA Transport Init ***\n" + " Transport opts: max_ioq_depth=%d, max_io_size=%d,\n" + " max_qpairs_per_ctrlr=%d, io_unit_size=%d,\n" + " in_capsule_data_size=%d, max_aq_depth=%d\n", + opts->max_queue_depth, + opts->max_io_size, + opts->max_qpairs_per_ctrlr, + opts->io_unit_size, + opts->in_capsule_data_size, + opts->max_aq_depth); + + /* I/O unit size cannot be larger than max I/O size */ + if (opts->io_unit_size > opts->max_io_size) { + opts->io_unit_size = opts->max_io_size; + } + + sge_count = opts->max_io_size / opts->io_unit_size; + if (sge_count > SPDK_NVMF_MAX_SGL_ENTRIES) { + SPDK_ERRLOG("Unsupported IO Unit size specified, %d bytes\n", opts->io_unit_size); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->event_channel = rdma_create_event_channel(); + if (rtransport->event_channel == NULL) { + SPDK_ERRLOG("rdma_create_event_channel() failed, %s\n", spdk_strerror(errno)); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + flag = fcntl(rtransport->event_channel->fd, F_GETFL); + if (fcntl(rtransport->event_channel->fd, F_SETFL, flag | O_NONBLOCK) < 0) { + SPDK_ERRLOG("fcntl can't set nonblocking mode for socket, fd: %d (%s)\n", + rtransport->event_channel->fd, spdk_strerror(errno)); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->data_buf_pool = spdk_mempool_create("spdk_nvmf_rdma", + opts->max_queue_depth * 4, /* The 4 is arbitrarily chosen. Needs to be configurable. */ + opts->max_io_size + NVMF_DATA_BUFFER_ALIGNMENT, + SPDK_MEMPOOL_DEFAULT_CACHE_SIZE, + SPDK_ENV_SOCKET_ID_ANY); + if (!rtransport->data_buf_pool) { + SPDK_ERRLOG("Unable to allocate buffer pool for poll group\n"); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + contexts = rdma_get_devices(NULL); + if (contexts == NULL) { + SPDK_ERRLOG("rdma_get_devices() failed: %s (%d)\n", spdk_strerror(errno), errno); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + i = 0; + rc = 0; + while (contexts[i] != NULL) { + device = calloc(1, sizeof(*device)); + if (!device) { + SPDK_ERRLOG("Unable to allocate memory for RDMA devices.\n"); + rc = -ENOMEM; + break; + } + device->context = contexts[i]; + rc = ibv_query_device(device->context, &device->attr); + if (rc < 0) { + SPDK_ERRLOG("Failed to query RDMA device attributes.\n"); + free(device); + break; + + } + +#ifdef SPDK_CONFIG_RDMA_SEND_WITH_INVAL + if ((device->attr.device_cap_flags & IBV_DEVICE_MEM_MGT_EXTENSIONS) == 0) { + SPDK_WARNLOG("The libibverbs on this system supports SEND_WITH_INVALIDATE,"); + SPDK_WARNLOG("but the device with vendor ID %u does not.\n", device->attr.vendor_id); + } + + /** + * The vendor ID is assigned by the IEEE and an ID of 0 implies Soft-RoCE. + * The Soft-RoCE RXE driver does not currently support send with invalidate, + * but incorrectly reports that it does. There are changes making their way + * through the kernel now that will enable this feature. When they are merged, + * we can conditionally enable this feature. + * + * TODO: enable this for versions of the kernel rxe driver that support it. + */ + if (device->attr.vendor_id == 0) { + device->attr.device_cap_flags &= ~(IBV_DEVICE_MEM_MGT_EXTENSIONS); + } +#endif + + /* set up device context async ev fd as NON_BLOCKING */ + flag = fcntl(device->context->async_fd, F_GETFL); + rc = fcntl(device->context->async_fd, F_SETFL, flag | O_NONBLOCK); + if (rc < 0) { + SPDK_ERRLOG("Failed to set context async fd to NONBLOCK.\n"); + free(device); + break; + } + + device->pd = ibv_alloc_pd(device->context); + if (!device->pd) { + SPDK_ERRLOG("Unable to allocate protection domain.\n"); + free(device); + rc = -1; + break; + } + + device->map = spdk_mem_map_alloc(0, &nvmf_rdma_map_ops, device); + if (!device->map) { + SPDK_ERRLOG("Unable to allocate memory map for new poll group\n"); + ibv_dealloc_pd(device->pd); + free(device); + rc = -1; + break; + } + + TAILQ_INSERT_TAIL(&rtransport->devices, device, link); + i++; + } + rdma_free_devices(contexts); + + if (rc < 0) { + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + /* Set up poll descriptor array to monitor events from RDMA and IB + * in a single poll syscall + */ + rtransport->npoll_fds = i + 1; + i = 0; + rtransport->poll_fds = calloc(rtransport->npoll_fds, sizeof(struct pollfd)); + if (rtransport->poll_fds == NULL) { + SPDK_ERRLOG("poll_fds allocation failed\n"); + spdk_nvmf_rdma_destroy(&rtransport->transport); + return NULL; + } + + rtransport->poll_fds[i].fd = rtransport->event_channel->fd; + rtransport->poll_fds[i++].events = POLLIN; + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + rtransport->poll_fds[i].fd = device->context->async_fd; + rtransport->poll_fds[i++].events = POLLIN; + } + + return &rtransport->transport; +} + +static int +spdk_nvmf_rdma_destroy(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *port_tmp; + struct spdk_nvmf_rdma_device *device, *device_tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, port_tmp) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + + if (rtransport->poll_fds != NULL) { + free(rtransport->poll_fds); + } + + if (rtransport->event_channel != NULL) { + rdma_destroy_event_channel(rtransport->event_channel); + } + + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, device_tmp) { + TAILQ_REMOVE(&rtransport->devices, device, link); + if (device->map) { + spdk_mem_map_free(&device->map); + } + if (device->pd) { + ibv_dealloc_pd(device->pd); + } + free(device); + } + + if (rtransport->data_buf_pool != NULL) { + if (spdk_mempool_count(rtransport->data_buf_pool) != + (transport->opts.max_queue_depth * 4)) { + SPDK_ERRLOG("transport buffer pool count is %zu but should be %u\n", + spdk_mempool_count(rtransport->data_buf_pool), + transport->opts.max_queue_depth * 4); + } + } + + spdk_mempool_free(rtransport->data_buf_pool); + spdk_io_device_unregister(rtransport, NULL); + pthread_mutex_destroy(&rtransport->lock); + free(rtransport); + + return 0; +} + +static int +spdk_nvmf_rdma_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_port *port_tmp, *port; + struct addrinfo *res; + struct addrinfo hints; + int family; + int rc; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + port = calloc(1, sizeof(*port)); + if (!port) { + return -ENOMEM; + } + + /* Selectively copy the trid. Things like NQN don't matter here - that + * mapping is enforced elsewhere. + */ + port->trid.trtype = SPDK_NVME_TRANSPORT_RDMA; + port->trid.adrfam = trid->adrfam; + snprintf(port->trid.traddr, sizeof(port->trid.traddr), "%s", trid->traddr); + snprintf(port->trid.trsvcid, sizeof(port->trid.trsvcid), "%s", trid->trsvcid); + + pthread_mutex_lock(&rtransport->lock); + assert(rtransport->event_channel != NULL); + TAILQ_FOREACH(port_tmp, &rtransport->ports, link) { + if (spdk_nvme_transport_id_compare(&port_tmp->trid, &port->trid) == 0) { + port_tmp->ref++; + free(port); + /* Already listening at this address */ + pthread_mutex_unlock(&rtransport->lock); + return 0; + } + } + + rc = rdma_create_id(rtransport->event_channel, &port->id, port, RDMA_PS_TCP); + if (rc < 0) { + SPDK_ERRLOG("rdma_create_id() failed\n"); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + switch (port->trid.adrfam) { + case SPDK_NVMF_ADRFAM_IPV4: + family = AF_INET; + break; + case SPDK_NVMF_ADRFAM_IPV6: + family = AF_INET6; + break; + default: + SPDK_ERRLOG("Unhandled ADRFAM %d\n", port->trid.adrfam); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_flags = AI_NUMERICSERV; + hints.ai_socktype = SOCK_STREAM; + hints.ai_protocol = 0; + + rc = getaddrinfo(port->trid.traddr, port->trid.trsvcid, &hints, &res); + if (rc) { + SPDK_ERRLOG("getaddrinfo failed: %s (%d)\n", gai_strerror(rc), rc); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + rc = rdma_bind_addr(port->id, res->ai_addr); + freeaddrinfo(res); + + if (rc < 0) { + SPDK_ERRLOG("rdma_bind_addr() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + if (!port->id->verbs) { + SPDK_ERRLOG("ibv_context is null\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -1; + } + + rc = rdma_listen(port->id, 10); /* 10 = backlog */ + if (rc < 0) { + SPDK_ERRLOG("rdma_listen() failed\n"); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return rc; + } + + TAILQ_FOREACH(device, &rtransport->devices, link) { + if (device->context == port->id->verbs) { + port->device = device; + break; + } + } + if (!port->device) { + SPDK_ERRLOG("Accepted a connection with verbs %p, but unable to find a corresponding device.\n", + port->id->verbs); + rdma_destroy_id(port->id); + free(port); + pthread_mutex_unlock(&rtransport->lock); + return -EINVAL; + } + + SPDK_INFOLOG(SPDK_LOG_RDMA, "*** NVMf Target Listening on %s port %d ***\n", + port->trid.traddr, ntohs(rdma_get_src_port(port->id))); + + port->ref = 1; + + TAILQ_INSERT_TAIL(&rtransport->ports, port, link); + pthread_mutex_unlock(&rtransport->lock); + + return 0; +} + +static int +spdk_nvmf_rdma_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *_trid) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_port *port, *tmp; + struct spdk_nvme_transport_id trid = {}; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + /* Selectively copy the trid. Things like NQN don't matter here - that + * mapping is enforced elsewhere. + */ + trid.trtype = SPDK_NVME_TRANSPORT_RDMA; + trid.adrfam = _trid->adrfam; + snprintf(trid.traddr, sizeof(port->trid.traddr), "%s", _trid->traddr); + snprintf(trid.trsvcid, sizeof(port->trid.trsvcid), "%s", _trid->trsvcid); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH_SAFE(port, &rtransport->ports, link, tmp) { + if (spdk_nvme_transport_id_compare(&port->trid, &trid) == 0) { + assert(port->ref > 0); + port->ref--; + if (port->ref == 0) { + TAILQ_REMOVE(&rtransport->ports, port, link); + rdma_destroy_id(port->id); + free(port); + } + break; + } + } + + pthread_mutex_unlock(&rtransport->lock); + return 0; +} + +static bool +spdk_nvmf_rdma_qpair_is_idle(struct spdk_nvmf_qpair *qpair) +{ + int cur_queue_depth, cur_rdma_rw_depth; + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + cur_queue_depth = spdk_nvmf_rdma_cur_queue_depth(rqpair); + cur_rdma_rw_depth = spdk_nvmf_rdma_cur_rw_depth(rqpair); + + if (cur_queue_depth == 0 && cur_rdma_rw_depth == 0) { + return true; + } + return false; +} + +static void +spdk_nvmf_rdma_qpair_process_pending(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_recv *rdma_recv, *recv_tmp; + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + + /* We process I/O in the data transfer pending queue at the highest priority. */ + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING], + state_link, req_tmp) { + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + + /* The second highest priority is I/O waiting on memory buffers. */ + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->ch->pending_data_buf_queue, link, + req_tmp) { + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } + + if (rqpair->qpair_disconnected) { + spdk_nvmf_rdma_qpair_destroy(rqpair); + return; + } + + /* Do not process newly received commands if qp is in ERROR state, + * wait till the recovery is complete. + */ + if (rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + return; + } + + /* The lowest priority is processing newly received commands */ + TAILQ_FOREACH_SAFE(rdma_recv, &rqpair->incoming_queue, link, recv_tmp) { + if (TAILQ_EMPTY(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE])) { + break; + } + + rdma_req = TAILQ_FIRST(&rqpair->state_queue[RDMA_REQUEST_STATE_FREE]); + rdma_req->recv = rdma_recv; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_NEW); + if (spdk_nvmf_rdma_request_process(rtransport, rdma_req) == false) { + break; + } + } +} + +static void +spdk_nvmf_rdma_drain_state_queue(struct spdk_nvmf_rdma_qpair *rqpair, + enum spdk_nvmf_rdma_request_state state) +{ + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + struct spdk_nvmf_rdma_transport *rtransport; + + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[state], state_link, req_tmp) { + rtransport = SPDK_CONTAINEROF(rdma_req->req.qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + } +} + +static void +spdk_nvmf_rdma_qpair_recover(struct spdk_nvmf_rdma_qpair *rqpair) +{ + enum ibv_qp_state state, next_state; + int recovered; + struct spdk_nvmf_rdma_transport *rtransport; + + if (!spdk_nvmf_rdma_qpair_is_idle(&rqpair->qpair)) { + /* There must be outstanding requests down to media. + * If so, wait till they're complete. + */ + assert(!TAILQ_EMPTY(&rqpair->qpair.outstanding)); + return; + } + + state = rqpair->ibv_attr.qp_state; + next_state = state; + + SPDK_NOTICELOG("RDMA qpair %u is in state: %s\n", + rqpair->qpair.qid, + str_ibv_qp_state[state]); + + if (!(state == IBV_QPS_ERR || state == IBV_QPS_RESET)) { + SPDK_ERRLOG("Can't recover RDMA qpair %u from the state: %s\n", + rqpair->qpair.qid, + str_ibv_qp_state[state]); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); + return; + } + + recovered = 0; + while (!recovered) { + switch (state) { + case IBV_QPS_ERR: + next_state = IBV_QPS_RESET; + break; + case IBV_QPS_RESET: + next_state = IBV_QPS_INIT; + break; + case IBV_QPS_INIT: + next_state = IBV_QPS_RTR; + break; + case IBV_QPS_RTR: + next_state = IBV_QPS_RTS; + break; + case IBV_QPS_RTS: + recovered = 1; + break; + default: + SPDK_ERRLOG("RDMA qpair %u unexpected state for recovery: %u\n", + rqpair->qpair.qid, state); + goto error; + } + /* Do not transition into same state */ + if (next_state == state) { + break; + } + + if (spdk_nvmf_rdma_set_ibv_state(rqpair, next_state)) { + goto error; + } + + state = next_state; + } + + rtransport = SPDK_CONTAINEROF(rqpair->qpair.transport, + struct spdk_nvmf_rdma_transport, + transport); + + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + + return; +error: + SPDK_NOTICELOG("RDMA qpair %u: recovery failed, disconnecting...\n", + rqpair->qpair.qid); + spdk_nvmf_qpair_disconnect(&rqpair->qpair, NULL, NULL); +} + +/* Clean up only the states that can be aborted at any time */ +static void +_spdk_nvmf_rdma_qp_cleanup_safe_states(struct spdk_nvmf_rdma_qpair *rqpair) +{ + struct spdk_nvmf_rdma_request *rdma_req, *req_tmp; + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_NEW); + TAILQ_FOREACH_SAFE(rdma_req, &rqpair->state_queue[RDMA_REQUEST_STATE_NEED_BUFFER], link, req_tmp) { + TAILQ_REMOVE(&rqpair->ch->pending_data_buf_queue, rdma_req, link); + } + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_NEED_BUFFER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_DATA_TRANSFER_PENDING); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_EXECUTED); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_READY_TO_COMPLETE); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETED); +} + +/* This cleans up all memory. It is only safe to use if the rest of the software stack + * has been shut down */ +static void +_spdk_nvmf_rdma_qp_cleanup_all_states(struct spdk_nvmf_rdma_qpair *rqpair) +{ + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_EXECUTING); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETING); +} + +static void +_spdk_nvmf_rdma_qp_error(void *arg) +{ + struct spdk_nvmf_rdma_qpair *rqpair = arg; + enum ibv_qp_state state; + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + state = rqpair->ibv_attr.qp_state; + if (state != IBV_QPS_ERR) { + /* Error was already recovered */ + return; + } + + if (spdk_nvmf_qpair_is_admin_queue(&rqpair->qpair)) { + spdk_nvmf_ctrlr_abort_aer(rqpair->qpair.ctrlr); + } + + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + /* Attempt recovery. This will exit without recovering if I/O requests + * are still outstanding */ + spdk_nvmf_rdma_qpair_recover(rqpair); +} + +static void +_spdk_nvmf_rdma_qp_last_wqe(void *arg) +{ + struct spdk_nvmf_rdma_qpair *rqpair = arg; + enum ibv_qp_state state; + + spdk_nvmf_rdma_qpair_dec_refcnt(rqpair); + + state = rqpair->ibv_attr.qp_state; + if (state != IBV_QPS_ERR) { + /* Error was already recovered */ + return; + } + + /* Clear out the states that are safe to clear any time, plus the + * RDMA data transfer states. */ + _spdk_nvmf_rdma_qp_cleanup_safe_states(rqpair); + + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST); + spdk_nvmf_rdma_drain_state_queue(rqpair, RDMA_REQUEST_STATE_COMPLETING); + + spdk_nvmf_rdma_qpair_recover(rqpair); +} + +static void +spdk_nvmf_process_ib_event(struct spdk_nvmf_rdma_device *device) +{ + int rc; + struct spdk_nvmf_rdma_qpair *rqpair; + struct ibv_async_event event; + enum ibv_qp_state state; + + rc = ibv_get_async_event(device->context, &event); + + if (rc) { + SPDK_ERRLOG("Failed to get async_event (%d): %s\n", + errno, spdk_strerror(errno)); + return; + } + + SPDK_NOTICELOG("Async event: %s\n", + ibv_event_type_str(event.event_type)); + + switch (event.event_type) { + case IBV_EVENT_QP_FATAL: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_error, rqpair); + break; + case IBV_EVENT_QP_LAST_WQE_REACHED: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_last_wqe, rqpair); + break; + case IBV_EVENT_SQ_DRAINED: + /* This event occurs frequently in both error and non-error states. + * Check if the qpair is in an error state before sending a message. + * Note that we're not on the correct thread to access the qpair, but + * the operations that the below calls make all happen to be thread + * safe. */ + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + state = spdk_nvmf_rdma_update_ibv_state(rqpair); + if (state == IBV_QPS_ERR) { + spdk_nvmf_rdma_qpair_inc_refcnt(rqpair); + spdk_thread_send_msg(rqpair->qpair.group->thread, _spdk_nvmf_rdma_qp_error, rqpair); + } + break; + case IBV_EVENT_QP_REQ_ERR: + case IBV_EVENT_QP_ACCESS_ERR: + case IBV_EVENT_COMM_EST: + case IBV_EVENT_PATH_MIG: + case IBV_EVENT_PATH_MIG_ERR: + rqpair = event.element.qp->qp_context; + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, + (uintptr_t)rqpair->cm_id, event.event_type); + spdk_nvmf_rdma_update_ibv_state(rqpair); + break; + case IBV_EVENT_CQ_ERR: + case IBV_EVENT_DEVICE_FATAL: + case IBV_EVENT_PORT_ACTIVE: + case IBV_EVENT_PORT_ERR: + case IBV_EVENT_LID_CHANGE: + case IBV_EVENT_PKEY_CHANGE: + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_SRQ_ERR: + case IBV_EVENT_SRQ_LIMIT_REACHED: + case IBV_EVENT_CLIENT_REREGISTER: + case IBV_EVENT_GID_CHANGE: + default: + spdk_trace_record(TRACE_RDMA_IBV_ASYNC_EVENT, 0, 0, 0, event.event_type); + break; + } + ibv_ack_async_event(&event); +} + +static void +spdk_nvmf_rdma_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + int nfds, i = 0; + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_device *device, *tmp; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + nfds = poll(rtransport->poll_fds, rtransport->npoll_fds, 0); + + if (nfds <= 0) { + return; + } + + /* The first poll descriptor is RDMA CM event */ + if (rtransport->poll_fds[i++].revents & POLLIN) { + spdk_nvmf_process_cm_event(transport, cb_fn); + nfds--; + } + + if (nfds == 0) { + return; + } + + /* Second and subsequent poll descriptors are IB async events */ + TAILQ_FOREACH_SAFE(device, &rtransport->devices, link, tmp) { + if (rtransport->poll_fds[i++].revents & POLLIN) { + spdk_nvmf_process_ib_event(device); + nfds--; + } + } + /* check all flagged fd's have been served */ + assert(nfds == 0); +} + +static void +spdk_nvmf_rdma_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + entry->trtype = SPDK_NVMF_TRTYPE_RDMA; + entry->adrfam = trid->adrfam; + entry->treq.secure_channel = SPDK_NVMF_TREQ_SECURE_CHANNEL_NOT_SPECIFIED; + + spdk_strcpy_pad(entry->trsvcid, trid->trsvcid, sizeof(entry->trsvcid), ' '); + spdk_strcpy_pad(entry->traddr, trid->traddr, sizeof(entry->traddr), ' '); + + entry->tsas.rdma.rdma_qptype = SPDK_NVMF_RDMA_QPTYPE_RELIABLE_CONNECTED; + entry->tsas.rdma.rdma_prtype = SPDK_NVMF_RDMA_PRTYPE_NONE; + entry->tsas.rdma.rdma_cms = SPDK_NVMF_RDMA_CMS_RDMA_CM; +} + +static struct spdk_nvmf_transport_poll_group * +spdk_nvmf_rdma_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller; + struct spdk_nvmf_rdma_device *device; + + rtransport = SPDK_CONTAINEROF(transport, struct spdk_nvmf_rdma_transport, transport); + + rgroup = calloc(1, sizeof(*rgroup)); + if (!rgroup) { + return NULL; + } + + TAILQ_INIT(&rgroup->pollers); + + pthread_mutex_lock(&rtransport->lock); + TAILQ_FOREACH(device, &rtransport->devices, link) { + poller = calloc(1, sizeof(*poller)); + if (!poller) { + SPDK_ERRLOG("Unable to allocate memory for new RDMA poller\n"); + free(rgroup); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + poller->device = device; + poller->group = rgroup; + + TAILQ_INIT(&poller->qpairs); + + poller->cq = ibv_create_cq(device->context, NVMF_RDMA_CQ_SIZE, poller, NULL, 0); + if (!poller->cq) { + SPDK_ERRLOG("Unable to create completion queue\n"); + free(poller); + free(rgroup); + pthread_mutex_unlock(&rtransport->lock); + return NULL; + } + + TAILQ_INSERT_TAIL(&rgroup->pollers, poller, link); + } + + pthread_mutex_unlock(&rtransport->lock); + return &rgroup->group; +} + +static void +spdk_nvmf_rdma_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *poller, *tmp; + struct spdk_nvmf_rdma_qpair *qpair, *tmp_qpair; + + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + if (!rgroup) { + return; + } + + TAILQ_FOREACH_SAFE(poller, &rgroup->pollers, link, tmp) { + TAILQ_REMOVE(&rgroup->pollers, poller, link); + + if (poller->cq) { + ibv_destroy_cq(poller->cq); + } + TAILQ_FOREACH_SAFE(qpair, &poller->qpairs, link, tmp_qpair) { + _spdk_nvmf_rdma_qp_cleanup_all_states(qpair); + spdk_nvmf_rdma_qpair_destroy(qpair); + } + + free(poller); + } + + free(rgroup); +} + +static int +spdk_nvmf_rdma_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_qpair *rqpair; + struct spdk_nvmf_rdma_device *device; + struct spdk_nvmf_rdma_poller *poller; + int rc; + + rtransport = SPDK_CONTAINEROF(qpair->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + device = rqpair->port->device; + + TAILQ_FOREACH(poller, &rgroup->pollers, link) { + if (poller->device == device) { + break; + } + } + + if (!poller) { + SPDK_ERRLOG("No poller found for device.\n"); + return -1; + } + + TAILQ_INSERT_TAIL(&poller->qpairs, rqpair, link); + rqpair->poller = poller; + + rc = spdk_nvmf_rdma_qpair_initialize(qpair); + if (rc < 0) { + SPDK_ERRLOG("Failed to initialize nvmf_rdma_qpair with qpair=%p\n", qpair); + return -1; + } + + rqpair->mgmt_channel = spdk_get_io_channel(rtransport); + if (!rqpair->mgmt_channel) { + spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + rqpair->ch = spdk_io_channel_get_ctx(rqpair->mgmt_channel); + assert(rqpair->ch != NULL); + + rc = spdk_nvmf_rdma_event_accept(rqpair->cm_id, rqpair); + if (rc) { + /* Try to reject, but we probably can't */ + spdk_nvmf_rdma_event_reject(rqpair->cm_id, SPDK_NVMF_RDMA_ERROR_NO_RESOURCES); + spdk_nvmf_rdma_qpair_destroy(rqpair); + return -1; + } + + spdk_nvmf_rdma_update_ibv_state(rqpair); + + return 0; +} + +static int +spdk_nvmf_rdma_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + + if (rdma_req->data_from_pool) { + /* Put the buffer/s back in the pool */ + for (uint32_t i = 0; i < rdma_req->req.iovcnt; i++) { + spdk_mempool_put(rtransport->data_buf_pool, rdma_req->data.buffers[i]); + rdma_req->req.iov[i].iov_base = NULL; + rdma_req->data.buffers[i] = NULL; + } + rdma_req->data_from_pool = false; + } + rdma_req->req.length = 0; + rdma_req->req.iovcnt = 0; + rdma_req->req.data = NULL; + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_FREE); + return 0; +} + +static int +spdk_nvmf_rdma_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_rdma_transport *rtransport = SPDK_CONTAINEROF(req->qpair->transport, + struct spdk_nvmf_rdma_transport, transport); + struct spdk_nvmf_rdma_request *rdma_req = SPDK_CONTAINEROF(req, + struct spdk_nvmf_rdma_request, req); + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, + struct spdk_nvmf_rdma_qpair, qpair); + + if (rqpair->ibv_attr.qp_state != IBV_QPS_ERR) { + /* The connection is alive, so process the request as normal */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_EXECUTED); + } else { + /* The connection is dead. Move the request directly to the completed state. */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + } + + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + if (rqpair->qpair.state == SPDK_NVMF_QPAIR_ACTIVE && rqpair->ibv_attr.qp_state == IBV_QPS_ERR) { + /* If the NVMe-oF layer thinks the connection is active, but the RDMA layer thinks + * the connection is dead, perform error recovery. */ + spdk_nvmf_rdma_qpair_recover(rqpair); + } + + return 0; +} + +static void +spdk_nvmf_rdma_close_qpair(struct spdk_nvmf_qpair *qpair) +{ + struct spdk_nvmf_rdma_qpair *rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + spdk_nvmf_rdma_qpair_destroy(rqpair); +} + +static struct spdk_nvmf_rdma_request * +get_rdma_req_from_wc(struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_request *rdma_req; + + rdma_req = (struct spdk_nvmf_rdma_request *)wc->wr_id; + assert(rdma_req != NULL); + +#ifdef DEBUG + struct spdk_nvmf_rdma_qpair *rqpair; + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req - rqpair->reqs >= 0); + assert(rdma_req - rqpair->reqs < (ptrdiff_t)rqpair->max_queue_depth); +#endif + + return rdma_req; +} + +static struct spdk_nvmf_rdma_recv * +get_rdma_recv_from_wc(struct ibv_wc *wc) +{ + struct spdk_nvmf_rdma_recv *rdma_recv; + + assert(wc->byte_len >= sizeof(struct spdk_nvmf_capsule_cmd)); + + rdma_recv = (struct spdk_nvmf_rdma_recv *)wc->wr_id; + assert(rdma_recv != NULL); + +#ifdef DEBUG + struct spdk_nvmf_rdma_qpair *rqpair = rdma_recv->qpair; + + assert(rdma_recv - rqpair->recvs >= 0); + assert(rdma_recv - rqpair->recvs < (ptrdiff_t)rqpair->max_queue_depth); +#endif + + return rdma_recv; +} + +#ifdef DEBUG +static int +spdk_nvmf_rdma_req_is_completing(struct spdk_nvmf_rdma_request *rdma_req) +{ + return rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_CONTROLLER_TO_HOST || + rdma_req->state == RDMA_REQUEST_STATE_COMPLETING; +} +#endif + +static int +spdk_nvmf_rdma_poller_poll(struct spdk_nvmf_rdma_transport *rtransport, + struct spdk_nvmf_rdma_poller *rpoller) +{ + struct ibv_wc wc[32]; + struct spdk_nvmf_rdma_request *rdma_req; + struct spdk_nvmf_rdma_recv *rdma_recv; + struct spdk_nvmf_rdma_qpair *rqpair; + int reaped, i; + int count = 0; + bool error = false; + + /* Poll for completing operations. */ + reaped = ibv_poll_cq(rpoller->cq, 32, wc); + if (reaped < 0) { + SPDK_ERRLOG("Error polling CQ! (%d): %s\n", + errno, spdk_strerror(errno)); + return -1; + } + + for (i = 0; i < reaped; i++) { + /* Handle error conditions */ + if (wc[i].status) { + SPDK_WARNLOG("CQ error on CQ %p, Request 0x%lu (%d): %s\n", + rpoller->cq, wc[i].wr_id, wc[i].status, ibv_wc_status_str(wc[i].status)); + error = true; + + switch (wc[i].opcode) { + case IBV_WC_SEND: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* We're going to attempt an error recovery, so force the request into + * the completed state. */ + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + break; + case IBV_WC_RECV: + rdma_recv = get_rdma_recv_from_wc(&wc[i]); + rqpair = rdma_recv->qpair; + + /* Dump this into the incoming queue. This gets cleaned up when + * the queue pair disconnects or recovers. */ + TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); + break; + case IBV_WC_RDMA_WRITE: + case IBV_WC_RDMA_READ: + /* If the data transfer fails still force the queue into the error state, + * but the rdma_req objects should only be manipulated in response to + * SEND and RECV operations. */ + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + break; + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + + /* Set the qpair to the error state. This will initiate a recovery. */ + spdk_nvmf_rdma_set_ibv_state(rqpair, IBV_QPS_ERR); + continue; + } + + switch (wc[i].opcode) { + case IBV_WC_SEND: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(spdk_nvmf_rdma_req_is_completing(rdma_req)); + + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_COMPLETED); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + count++; + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RDMA_WRITE: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RDMA_READ: + rdma_req = get_rdma_req_from_wc(&wc[i]); + rqpair = SPDK_CONTAINEROF(rdma_req->req.qpair, struct spdk_nvmf_rdma_qpair, qpair); + + assert(rdma_req->state == RDMA_REQUEST_STATE_TRANSFERRING_HOST_TO_CONTROLLER); + spdk_nvmf_rdma_request_set_state(rdma_req, RDMA_REQUEST_STATE_READY_TO_EXECUTE); + spdk_nvmf_rdma_request_process(rtransport, rdma_req); + + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + case IBV_WC_RECV: + rdma_recv = get_rdma_recv_from_wc(&wc[i]); + rqpair = rdma_recv->qpair; + + TAILQ_INSERT_TAIL(&rqpair->incoming_queue, rdma_recv, link); + /* Try to process other queued requests */ + spdk_nvmf_rdma_qpair_process_pending(rtransport, rqpair); + break; + + default: + SPDK_ERRLOG("Received an unknown opcode on the CQ: %d\n", wc[i].opcode); + continue; + } + } + + if (error == true) { + return -1; + } + + return count; +} + +static int +spdk_nvmf_rdma_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + struct spdk_nvmf_rdma_transport *rtransport; + struct spdk_nvmf_rdma_poll_group *rgroup; + struct spdk_nvmf_rdma_poller *rpoller; + int count, rc; + + rtransport = SPDK_CONTAINEROF(group->transport, struct spdk_nvmf_rdma_transport, transport); + rgroup = SPDK_CONTAINEROF(group, struct spdk_nvmf_rdma_poll_group, group); + + count = 0; + TAILQ_FOREACH(rpoller, &rgroup->pollers, link) { + rc = spdk_nvmf_rdma_poller_poll(rtransport, rpoller); + if (rc < 0) { + return rc; + } + count += rc; + } + + return count; +} + +static int +spdk_nvmf_rdma_trid_from_cm_id(struct rdma_cm_id *id, + struct spdk_nvme_transport_id *trid, + bool peer) +{ + struct sockaddr *saddr; + uint16_t port; + + trid->trtype = SPDK_NVME_TRANSPORT_RDMA; + + if (peer) { + saddr = rdma_get_peer_addr(id); + } else { + saddr = rdma_get_local_addr(id); + } + switch (saddr->sa_family) { + case AF_INET: { + struct sockaddr_in *saddr_in = (struct sockaddr_in *)saddr; + + trid->adrfam = SPDK_NVMF_ADRFAM_IPV4; + inet_ntop(AF_INET, &saddr_in->sin_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + case AF_INET6: { + struct sockaddr_in6 *saddr_in = (struct sockaddr_in6 *)saddr; + trid->adrfam = SPDK_NVMF_ADRFAM_IPV6; + inet_ntop(AF_INET6, &saddr_in->sin6_addr, + trid->traddr, sizeof(trid->traddr)); + if (peer) { + port = ntohs(rdma_get_dst_port(id)); + } else { + port = ntohs(rdma_get_src_port(id)); + } + snprintf(trid->trsvcid, sizeof(trid->trsvcid), "%u", port); + break; + } + default: + return -1; + + } + + return 0; +} + +static int +spdk_nvmf_rdma_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, true); +} + +static int +spdk_nvmf_rdma_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->cm_id, trid, false); +} + +static int +spdk_nvmf_rdma_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_rdma_qpair *rqpair; + + rqpair = SPDK_CONTAINEROF(qpair, struct spdk_nvmf_rdma_qpair, qpair); + + return spdk_nvmf_rdma_trid_from_cm_id(rqpair->listen_id, trid, false); +} + +const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma = { + .type = SPDK_NVME_TRANSPORT_RDMA, + .opts_init = spdk_nvmf_rdma_opts_init, + .create = spdk_nvmf_rdma_create, + .destroy = spdk_nvmf_rdma_destroy, + + .listen = spdk_nvmf_rdma_listen, + .stop_listen = spdk_nvmf_rdma_stop_listen, + .accept = spdk_nvmf_rdma_accept, + + .listener_discover = spdk_nvmf_rdma_discover, + + .poll_group_create = spdk_nvmf_rdma_poll_group_create, + .poll_group_destroy = spdk_nvmf_rdma_poll_group_destroy, + .poll_group_add = spdk_nvmf_rdma_poll_group_add, + .poll_group_poll = spdk_nvmf_rdma_poll_group_poll, + + .req_free = spdk_nvmf_rdma_request_free, + .req_complete = spdk_nvmf_rdma_request_complete, + + .qpair_fini = spdk_nvmf_rdma_close_qpair, + .qpair_is_idle = spdk_nvmf_rdma_qpair_is_idle, + .qpair_get_peer_trid = spdk_nvmf_rdma_qpair_get_peer_trid, + .qpair_get_local_trid = spdk_nvmf_rdma_qpair_get_local_trid, + .qpair_get_listen_trid = spdk_nvmf_rdma_qpair_get_listen_trid, + +}; + +SPDK_LOG_REGISTER_COMPONENT("rdma", SPDK_LOG_RDMA) diff --git a/src/spdk/lib/nvmf/request.c b/src/spdk/lib/nvmf/request.c new file mode 100644 index 00000000..88b6b9a9 --- /dev/null +++ b/src/spdk/lib/nvmf/request.c @@ -0,0 +1,190 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/thread.h" +#include "spdk/likely.h" +#include "spdk/nvme.h" +#include "spdk/nvmf_spec.h" +#include "spdk/trace.h" + +#include "spdk_internal/assert.h" +#include "spdk_internal/log.h" + +static void +spdk_nvmf_qpair_request_cleanup(struct spdk_nvmf_qpair *qpair) +{ + if (qpair->state == SPDK_NVMF_QPAIR_DEACTIVATING) { + assert(qpair->state_cb != NULL); + + if (TAILQ_EMPTY(&qpair->outstanding)) { + qpair->state_cb(qpair->state_cb_arg, 0); + } + } else { + assert(qpair->state == SPDK_NVMF_QPAIR_ACTIVE); + } +} + +int +spdk_nvmf_request_free(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (spdk_nvmf_transport_req_free(req)) { + SPDK_ERRLOG("Unable to free transport level request resources.\n"); + } + + spdk_nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +int +spdk_nvmf_request_complete(struct spdk_nvmf_request *req) +{ + struct spdk_nvme_cpl *rsp = &req->rsp->nvme_cpl; + struct spdk_nvmf_qpair *qpair; + + rsp->sqid = 0; + rsp->status.p = 0; + rsp->cid = req->cmd->nvme_cmd.cid; + + qpair = req->qpair; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "cpl: cid=%u cdw0=0x%08x rsvd1=%u status=0x%04x\n", + rsp->cid, rsp->cdw0, rsp->rsvd1, + *(uint16_t *)&rsp->status); + + TAILQ_REMOVE(&qpair->outstanding, req, link); + if (spdk_nvmf_transport_req_complete(req)) { + SPDK_ERRLOG("Transport request completion error!\n"); + } + + spdk_nvmf_qpair_request_cleanup(qpair); + + return 0; +} + +static void +nvmf_trace_command(union nvmf_h2c_msg *h2c_msg, bool is_admin_queue) +{ + struct spdk_nvmf_capsule_cmd *cap_hdr = &h2c_msg->nvmf_cmd; + struct spdk_nvme_cmd *cmd = &h2c_msg->nvme_cmd; + struct spdk_nvme_sgl_descriptor *sgl = &cmd->dptr.sgl1; + uint8_t opc; + + if (cmd->opc == SPDK_NVME_OPC_FABRIC) { + opc = cap_hdr->fctype; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s Fabrics cmd: fctype 0x%02x cid %u\n", + is_admin_queue ? "Admin" : "I/O", + cap_hdr->fctype, cap_hdr->cid); + } else { + opc = cmd->opc; + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "%s cmd: opc 0x%02x fuse %u cid %u nsid %u cdw10 0x%08x\n", + is_admin_queue ? "Admin" : "I/O", + cmd->opc, cmd->fuse, cmd->cid, cmd->nsid, cmd->cdw10); + if (cmd->mptr) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "mptr 0x%" PRIx64 "\n", cmd->mptr); + } + if (cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_CONTIG && + cmd->psdt != SPDK_NVME_PSDT_SGL_MPTR_SGL) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "psdt %u\n", cmd->psdt); + } + } + + if (spdk_nvme_opc_get_data_transfer(opc) != SPDK_NVME_DATA_NONE) { + if (sgl->generic.type == SPDK_NVME_SGL_TYPE_KEYED_DATA_BLOCK) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, + "SGL: Keyed%s: addr 0x%" PRIx64 " key 0x%x len 0x%x\n", + sgl->generic.subtype == SPDK_NVME_SGL_SUBTYPE_INVALIDATE_KEY ? " (Inv)" : "", + sgl->address, sgl->keyed.key, sgl->keyed.length); + } else if (sgl->generic.type == SPDK_NVME_SGL_TYPE_DATA_BLOCK) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL: Data block: %s 0x%" PRIx64 " len 0x%x\n", + sgl->unkeyed.subtype == SPDK_NVME_SGL_SUBTYPE_OFFSET ? "offs" : "addr", + sgl->address, sgl->unkeyed.length); + } else { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "SGL type 0x%x subtype 0x%x\n", + sgl->generic.type, sgl->generic.subtype); + } + } +} + +void +spdk_nvmf_request_exec(struct spdk_nvmf_request *req) +{ + struct spdk_nvmf_qpair *qpair = req->qpair; + spdk_nvmf_request_exec_status status; + + nvmf_trace_command(req->cmd, spdk_nvmf_qpair_is_admin_queue(qpair)); + + if (qpair->state != SPDK_NVMF_QPAIR_ACTIVE) { + req->rsp->nvme_cpl.status.sct = SPDK_NVME_SCT_GENERIC; + req->rsp->nvme_cpl.status.sc = SPDK_NVME_SC_COMMAND_SEQUENCE_ERROR; + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + spdk_nvmf_request_complete(req); + return; + } + + /* Check if the subsystem is paused (if there is a subsystem) */ + if (qpair->ctrlr) { + struct spdk_nvmf_subsystem_poll_group *sgroup = &qpair->group->sgroups[qpair->ctrlr->subsys->id]; + if (sgroup->state != SPDK_NVMF_SUBSYSTEM_ACTIVE) { + /* The subsystem is not currently active. Queue this request. */ + TAILQ_INSERT_TAIL(&sgroup->queued, req, link); + return; + } + + } + + /* Place the request on the outstanding list so we can keep track of it */ + TAILQ_INSERT_TAIL(&qpair->outstanding, req, link); + + if (spdk_unlikely(req->cmd->nvmf_cmd.opcode == SPDK_NVME_OPC_FABRIC)) { + status = spdk_nvmf_ctrlr_process_fabrics_cmd(req); + } else if (spdk_unlikely(spdk_nvmf_qpair_is_admin_queue(qpair))) { + status = spdk_nvmf_ctrlr_process_admin_cmd(req); + } else { + status = spdk_nvmf_ctrlr_process_io_cmd(req); + } + + if (status == SPDK_NVMF_REQUEST_EXEC_STATUS_COMPLETE) { + spdk_nvmf_request_complete(req); + } +} diff --git a/src/spdk/lib/nvmf/subsystem.c b/src/spdk/lib/nvmf/subsystem.c new file mode 100644 index 00000000..9e28f3c6 --- /dev/null +++ b/src/spdk/lib/nvmf/subsystem.c @@ -0,0 +1,1269 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/event.h" +#include "spdk/likely.h" +#include "spdk/string.h" +#include "spdk/trace.h" +#include "spdk/nvmf_spec.h" +#include "spdk/uuid.h" + +#include "spdk/bdev_module.h" +#include "spdk_internal/log.h" +#include "spdk_internal/utf.h" + +/* + * States for parsing valid domains in NQNs according to RFC 1034 + */ +enum spdk_nvmf_nqn_domain_states { + /* First character of a domain must be a letter */ + SPDK_NVMF_DOMAIN_ACCEPT_LETTER = 0, + + /* Subsequent characters can be any of letter, digit, or hyphen */ + SPDK_NVMF_DOMAIN_ACCEPT_LDH = 1, + + /* A domain label must end with either a letter or digit */ + SPDK_NVMF_DOMAIN_ACCEPT_ANY = 2 +}; + +/* Returns true if is a valid ASCII string as defined by the NVMe spec */ +static bool +spdk_nvmf_valid_ascii_string(const void *buf, size_t size) +{ + const uint8_t *str = buf; + size_t i; + + for (i = 0; i < size; i++) { + if (str[i] < 0x20 || str[i] > 0x7E) { + return false; + } + } + + return true; +} + +static bool +spdk_nvmf_valid_nqn(const char *nqn) +{ + size_t len; + struct spdk_uuid uuid_value; + uint32_t i; + int bytes_consumed; + uint32_t domain_label_length; + char *reverse_domain_end; + uint32_t reverse_domain_end_index; + enum spdk_nvmf_nqn_domain_states domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + + /* Check for length requirements */ + len = strlen(nqn); + if (len > SPDK_NVMF_NQN_MAX_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu > max %d\n", nqn, len, SPDK_NVMF_NQN_MAX_LEN); + return false; + } + + /* The nqn must be at least as long as SPDK_NVMF_NQN_MIN_LEN to contain the necessary prefix. */ + if (len < SPDK_NVMF_NQN_MIN_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": length %zu < min %d\n", nqn, len, SPDK_NVMF_NQN_MIN_LEN); + return false; + } + + /* Check for discovery controller nqn */ + if (!strcmp(nqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + /* Check for equality with the generic nqn structure of the form "nqn.2014-08.org.nvmexpress:uuid:11111111-2222-3333-4444-555555555555" */ + if (!strncmp(nqn, SPDK_NVMF_NQN_UUID_PRE, SPDK_NVMF_NQN_UUID_PRE_LEN)) { + if (len != SPDK_NVMF_NQN_UUID_PRE_LEN + SPDK_NVMF_UUID_STRING_LEN) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not the correct length\n", nqn); + return false; + } + + if (spdk_uuid_parse(&uuid_value, &nqn[SPDK_NVMF_NQN_UUID_PRE_LEN])) { + SPDK_ERRLOG("Invalid NQN \"%s\": uuid is not formatted correctly\n", nqn); + return false; + } + return true; + } + + /* If the nqn does not match the uuid structure, the next several checks validate the form "nqn.yyyy-mm.reverse.domain:user-string" */ + + if (strncmp(nqn, "nqn.", 4) != 0) { + SPDK_ERRLOG("Invalid NQN \"%s\": NQN must begin with \"nqn.\".\n", nqn); + return false; + } + + /* Check for yyyy-mm. */ + if (!(isdigit(nqn[4]) && isdigit(nqn[5]) && isdigit(nqn[6]) && isdigit(nqn[7]) && + nqn[8] == '-' && isdigit(nqn[9]) && isdigit(nqn[10]) && nqn[11] == '.')) { + SPDK_ERRLOG("Invalid date code in NQN \"%s\"\n", nqn); + return false; + } + + reverse_domain_end = strchr(nqn, ':'); + if (reverse_domain_end != NULL && (reverse_domain_end_index = reverse_domain_end - nqn) < len - 1) { + } else { + SPDK_ERRLOG("Invalid NQN \"%s\". NQN must contain user specified name with a ':' as a prefix.\n", + nqn); + return false; + } + + /* Check for valid reverse domain */ + domain_label_length = 0; + for (i = 12; i < reverse_domain_end_index; i++) { + if (domain_label_length > SPDK_DOMAIN_LABEL_MAX_LEN) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". At least one Label is too long.\n", nqn); + return false; + } + + switch (domain_state) { + + case SPDK_NVMF_DOMAIN_ACCEPT_LETTER: { + if (isalpha(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must start with a letter.\n", nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_LDH: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + + case SPDK_NVMF_DOMAIN_ACCEPT_ANY: { + if (isalpha(nqn[i]) || isdigit(nqn[i])) { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_ANY; + domain_label_length++; + break; + } else if (nqn[i] == '-') { + if (i == reverse_domain_end_index - 1) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must end with an alphanumeric symbol.\n", + nqn); + return false; + } + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LDH; + domain_label_length++; + break; + } else if (nqn[i] == '.') { + domain_state = SPDK_NVMF_DOMAIN_ACCEPT_LETTER; + domain_label_length = 0; + break; + } else { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only [a-z,A-Z,0-9,'-','.'].\n", + nqn); + return false; + } + } + } + } + + i = reverse_domain_end_index + 1; + while (i < len) { + bytes_consumed = utf8_valid(&nqn[i], &nqn[len]); + if (bytes_consumed <= 0) { + SPDK_ERRLOG("Invalid domain name in NQN \"%s\". Label names must contain only valid utf-8.\n", nqn); + return false; + } + + i += bytes_consumed; + } + return true; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_create(struct spdk_nvmf_tgt *tgt, + const char *nqn, + enum spdk_nvmf_subtype type, + uint32_t num_ns) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + if (spdk_nvmf_tgt_find_subsystem(tgt, nqn)) { + SPDK_ERRLOG("Subsystem NQN '%s' already exists\n", nqn); + return NULL; + } + + if (!spdk_nvmf_valid_nqn(nqn)) { + return NULL; + } + + if (type == SPDK_NVMF_SUBTYPE_DISCOVERY && num_ns != 0) { + SPDK_ERRLOG("Discovery subsystem cannot have namespaces.\n"); + return NULL; + } + + /* Find a free subsystem id (sid) */ + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + if (tgt->subsystems[sid] == NULL) { + break; + } + } + if (sid >= tgt->opts.max_subsystems) { + return NULL; + } + + subsystem = calloc(1, sizeof(struct spdk_nvmf_subsystem)); + if (subsystem == NULL) { + return NULL; + } + + subsystem->thread = spdk_get_thread(); + subsystem->state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + subsystem->tgt = tgt; + subsystem->id = sid; + subsystem->subtype = type; + subsystem->max_nsid = num_ns; + subsystem->max_allowed_nsid = num_ns; + subsystem->next_cntlid = 0; + snprintf(subsystem->subnqn, sizeof(subsystem->subnqn), "%s", nqn); + TAILQ_INIT(&subsystem->listeners); + TAILQ_INIT(&subsystem->hosts); + TAILQ_INIT(&subsystem->ctrlrs); + + if (num_ns != 0) { + subsystem->ns = calloc(num_ns, sizeof(struct spdk_nvmf_ns *)); + if (subsystem->ns == NULL) { + SPDK_ERRLOG("Namespace memory allocation failed\n"); + free(subsystem); + return NULL; + } + } + + memset(subsystem->sn, '0', sizeof(subsystem->sn) - 1); + subsystem->sn[sizeof(subsystem->sn) - 1] = '\n'; + + tgt->subsystems[sid] = subsystem; + tgt->discovery_genctr++; + + return subsystem; +} + +static void +_spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_host *host) +{ + TAILQ_REMOVE(&subsystem->hosts, host, link); + free(host->nqn); + free(host); +} + +static int _spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid); + +void +spdk_nvmf_subsystem_destroy(struct spdk_nvmf_subsystem *subsystem) +{ + struct spdk_nvmf_listener *listener, *listener_tmp; + struct spdk_nvmf_host *host, *host_tmp; + struct spdk_nvmf_ctrlr *ctrlr, *ctrlr_tmp; + struct spdk_nvmf_ns *ns; + + if (!subsystem) { + return; + } + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "subsystem is %p\n", subsystem); + + TAILQ_FOREACH_SAFE(listener, &subsystem->listeners, link, listener_tmp) { + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); + } + + TAILQ_FOREACH_SAFE(host, &subsystem->hosts, link, host_tmp) { + _spdk_nvmf_subsystem_remove_host(subsystem, host); + } + + TAILQ_FOREACH_SAFE(ctrlr, &subsystem->ctrlrs, link, ctrlr_tmp) { + spdk_nvmf_ctrlr_destruct(ctrlr); + } + + ns = spdk_nvmf_subsystem_get_first_ns(subsystem); + while (ns != NULL) { + struct spdk_nvmf_ns *next_ns = spdk_nvmf_subsystem_get_next_ns(subsystem, ns); + + _spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid); + ns = next_ns; + } + + free(subsystem->ns); + + subsystem->tgt->subsystems[subsystem->id] = NULL; + subsystem->tgt->discovery_genctr++; + + free(subsystem); +} + +static int +spdk_nvmf_subsystem_set_state(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state state) +{ + enum spdk_nvmf_subsystem_state actual_old_state, expected_old_state; + + switch (state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_INACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + case SPDK_NVMF_SUBSYSTEM_RESUMING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_PAUSED; + break; + case SPDK_NVMF_SUBSYSTEM_DEACTIVATING: + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVE; + break; + default: + assert(false); + return -1; + } + + actual_old_state = __sync_val_compare_and_swap(&subsystem->state, expected_old_state, state); + if (actual_old_state != expected_old_state) { + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_RESUMING && + state == SPDK_NVMF_SUBSYSTEM_ACTIVE) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } + /* This is for the case when activating the subsystem fails. */ + if (actual_old_state == SPDK_NVMF_SUBSYSTEM_ACTIVATING && + state == SPDK_NVMF_SUBSYSTEM_DEACTIVATING) { + expected_old_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + actual_old_state = __sync_val_compare_and_swap(&subsystem->state, expected_old_state, state); + } + assert(actual_old_state == expected_old_state); + return actual_old_state - expected_old_state; +} + +struct subsystem_state_change_ctx { + struct spdk_nvmf_subsystem *subsystem; + + enum spdk_nvmf_subsystem_state requested_state; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_state_change_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_state_change_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (status == 0) { + status = spdk_nvmf_subsystem_set_state(ctx->subsystem, ctx->requested_state); + if (status) { + status = -1; + } + } + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_state_change_continue(void *ctx, int status) +{ + struct spdk_io_channel_iter *i = ctx; + spdk_for_each_channel_continue(i, status); +} + +static void +subsystem_state_change_on_pg(struct spdk_io_channel_iter *i) +{ + struct subsystem_state_change_ctx *ctx; + struct spdk_io_channel *ch; + struct spdk_nvmf_poll_group *group; + + ctx = spdk_io_channel_iter_get_ctx(i); + ch = spdk_io_channel_iter_get_channel(i); + group = spdk_io_channel_get_ctx(ch); + + switch (ctx->requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + spdk_nvmf_poll_group_remove_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_ACTIVATING) { + spdk_nvmf_poll_group_add_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } else if (ctx->subsystem->state == SPDK_NVMF_SUBSYSTEM_RESUMING) { + spdk_nvmf_poll_group_resume_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + spdk_nvmf_poll_group_pause_subsystem(group, ctx->subsystem, subsystem_state_change_continue, i); + break; + default: + assert(false); + break; + } +} + +static int +spdk_nvmf_subsystem_state_change(struct spdk_nvmf_subsystem *subsystem, + enum spdk_nvmf_subsystem_state requested_state, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + struct subsystem_state_change_ctx *ctx; + enum spdk_nvmf_subsystem_state intermediate_state; + int rc; + + switch (requested_state) { + case SPDK_NVMF_SUBSYSTEM_INACTIVE: + intermediate_state = SPDK_NVMF_SUBSYSTEM_DEACTIVATING; + break; + case SPDK_NVMF_SUBSYSTEM_ACTIVE: + if (subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED) { + intermediate_state = SPDK_NVMF_SUBSYSTEM_RESUMING; + } else { + intermediate_state = SPDK_NVMF_SUBSYSTEM_ACTIVATING; + } + break; + case SPDK_NVMF_SUBSYSTEM_PAUSED: + intermediate_state = SPDK_NVMF_SUBSYSTEM_PAUSING; + break; + default: + assert(false); + return -EINVAL; + } + + ctx = calloc(1, sizeof(*ctx)); + if (!ctx) { + return -ENOMEM; + } + + rc = spdk_nvmf_subsystem_set_state(subsystem, intermediate_state); + if (rc) { + free(ctx); + return rc; + } + + ctx->subsystem = subsystem; + ctx->requested_state = requested_state; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_for_each_channel(subsystem->tgt, + subsystem_state_change_on_pg, + ctx, + subsystem_state_change_done); + + return 0; +} + +int +spdk_nvmf_subsystem_start(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_stop(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_INACTIVE, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_pause(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_PAUSED, cb_fn, cb_arg); +} + +int +spdk_nvmf_subsystem_resume(struct spdk_nvmf_subsystem *subsystem, + spdk_nvmf_subsystem_state_change_done cb_fn, + void *cb_arg) +{ + return spdk_nvmf_subsystem_state_change(subsystem, SPDK_NVMF_SUBSYSTEM_ACTIVE, cb_fn, cb_arg); +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_first(struct spdk_nvmf_tgt *tgt) +{ + struct spdk_nvmf_subsystem *subsystem; + uint32_t sid; + + for (sid = 0; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +struct spdk_nvmf_subsystem * +spdk_nvmf_subsystem_get_next(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t sid; + struct spdk_nvmf_tgt *tgt; + + if (!subsystem) { + return NULL; + } + + tgt = subsystem->tgt; + + for (sid = subsystem->id + 1; sid < tgt->opts.max_subsystems; sid++) { + subsystem = tgt->subsystems[sid]; + if (subsystem) { + return subsystem; + } + } + + return NULL; +} + +static struct spdk_nvmf_host * +_spdk_nvmf_subsystem_find_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host = NULL; + + TAILQ_FOREACH(host, &subsystem->hosts, link) { + if (strcmp(hostnqn, host->nqn) == 0) { + return host; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!spdk_nvmf_valid_nqn(hostnqn)) { + return -EINVAL; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (_spdk_nvmf_subsystem_find_host(subsystem, hostnqn)) { + /* This subsystem already allows the specified host. */ + return 0; + } + + host = calloc(1, sizeof(*host)); + if (!host) { + return -ENOMEM; + } + host->nqn = strdup(hostnqn); + if (!host->nqn) { + free(host); + return -ENOMEM; + } + + TAILQ_INSERT_HEAD(&subsystem->hosts, host, link); + subsystem->tgt->discovery_genctr++; + + return 0; +} + +int +spdk_nvmf_subsystem_remove_host(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + struct spdk_nvmf_host *host; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + host = _spdk_nvmf_subsystem_find_host(subsystem, hostnqn); + if (host == NULL) { + return -ENOENT; + } + + _spdk_nvmf_subsystem_remove_host(subsystem, host); + return 0; +} + +int +spdk_nvmf_subsystem_set_allow_any_host(struct spdk_nvmf_subsystem *subsystem, bool allow_any_host) +{ + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + subsystem->allow_any_host = allow_any_host; + + return 0; +} + +bool +spdk_nvmf_subsystem_get_allow_any_host(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->allow_any_host; +} + +bool +spdk_nvmf_subsystem_host_allowed(struct spdk_nvmf_subsystem *subsystem, const char *hostnqn) +{ + if (!hostnqn) { + return false; + } + + if (subsystem->allow_any_host) { + return true; + } + + return _spdk_nvmf_subsystem_find_host(subsystem, hostnqn) != NULL; +} + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_first_host(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->hosts); +} + + +struct spdk_nvmf_host * +spdk_nvmf_subsystem_get_next_host(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_host *prev_host) +{ + return TAILQ_NEXT(prev_host, link); +} + +const char * +spdk_nvmf_host_get_nqn(struct spdk_nvmf_host *host) +{ + return host->nqn; +} + +static struct spdk_nvmf_listener * +_spdk_nvmf_subsystem_find_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return listener; + } + } + + return NULL; +} + +int +spdk_nvmf_subsystem_add_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_transport *transport; + struct spdk_nvmf_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + if (_spdk_nvmf_subsystem_find_listener(subsystem, trid)) { + /* Listener already exists in this subsystem */ + return 0; + } + + transport = spdk_nvmf_tgt_get_transport(subsystem->tgt, trid->trtype); + if (transport == NULL) { + SPDK_ERRLOG("Unknown transport type %d\n", trid->trtype); + return -EINVAL; + } + + listener = calloc(1, sizeof(*listener)); + if (!listener) { + return -ENOMEM; + } + + listener->trid = *trid; + listener->transport = transport; + + TAILQ_INSERT_HEAD(&subsystem->listeners, listener, link); + + return 0; +} + +int +spdk_nvmf_subsystem_remove_listener(struct spdk_nvmf_subsystem *subsystem, + const struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -EAGAIN; + } + + listener = _spdk_nvmf_subsystem_find_listener(subsystem, trid); + if (listener == NULL) { + return -ENOENT; + } + + TAILQ_REMOVE(&subsystem->listeners, listener, link); + free(listener); + + return 0; +} + +bool +spdk_nvmf_subsystem_listener_allowed(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvme_transport_id *trid) +{ + struct spdk_nvmf_listener *listener; + + if (!strcmp(subsystem->subnqn, SPDK_NVMF_DISCOVERY_NQN)) { + return true; + } + + TAILQ_FOREACH(listener, &subsystem->listeners, link) { + if (spdk_nvme_transport_id_compare(&listener->trid, trid) == 0) { + return true; + } + } + + return false; +} + +struct spdk_nvmf_listener * +spdk_nvmf_subsystem_get_first_listener(struct spdk_nvmf_subsystem *subsystem) +{ + return TAILQ_FIRST(&subsystem->listeners); +} + +struct spdk_nvmf_listener * +spdk_nvmf_subsystem_get_next_listener(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_listener *prev_listener) +{ + return TAILQ_NEXT(prev_listener, link); +} + +const struct spdk_nvme_transport_id * +spdk_nvmf_listener_get_trid(struct spdk_nvmf_listener *listener) +{ + return &listener->trid; +} + +struct subsystem_update_ns_ctx { + struct spdk_nvmf_subsystem *subsystem; + + spdk_nvmf_subsystem_state_change_done cb_fn; + void *cb_arg; +}; + +static void +subsystem_update_ns_done(struct spdk_io_channel_iter *i, int status) +{ + struct subsystem_update_ns_ctx *ctx = spdk_io_channel_iter_get_ctx(i); + + if (ctx->cb_fn) { + ctx->cb_fn(ctx->subsystem, ctx->cb_arg, status); + } + free(ctx); +} + +static void +subsystem_update_ns_on_pg(struct spdk_io_channel_iter *i) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + struct spdk_nvmf_poll_group *group; + struct spdk_nvmf_subsystem *subsystem; + + ctx = spdk_io_channel_iter_get_ctx(i); + group = spdk_io_channel_get_ctx(spdk_io_channel_iter_get_channel(i)); + subsystem = ctx->subsystem; + + rc = spdk_nvmf_poll_group_update_subsystem(group, subsystem); + spdk_for_each_channel_continue(i, rc); +} + +static int +spdk_nvmf_subsystem_update_ns(struct spdk_nvmf_subsystem *subsystem, spdk_channel_for_each_cpl cpl, + void *ctx) +{ + spdk_for_each_channel(subsystem->tgt, + subsystem_update_ns_on_pg, + ctx, + cpl); + + return 0; +} + +static void +spdk_nvmf_subsystem_ns_changed(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + spdk_nvmf_ctrlr_ns_changed(ctrlr, nsid); + } +} + +static int +_spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + struct spdk_nvmf_ns *ns; + + assert(subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED || + subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE); + + if (nsid == 0 || nsid > subsystem->max_nsid) { + return -1; + } + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return -1; + } + + ns = subsystem->ns[nsid - 1]; + if (!ns) { + return -1; + } + + subsystem->ns[nsid - 1] = NULL; + + spdk_bdev_module_release_bdev(ns->bdev); + spdk_bdev_close(ns->desc); + free(ns); + + spdk_nvmf_subsystem_ns_changed(subsystem, nsid); + + return 0; +} + +int +spdk_nvmf_subsystem_remove_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid, + spdk_nvmf_subsystem_state_change_done cb_fn, void *cb_arg) +{ + int rc; + struct subsystem_update_ns_ctx *ctx; + + rc = _spdk_nvmf_subsystem_remove_ns(subsystem, nsid); + if (rc < 0) { + return rc; + } + + ctx = calloc(1, sizeof(*ctx)); + + if (ctx == NULL) { + return -ENOMEM; + } + + ctx->subsystem = subsystem; + ctx->cb_fn = cb_fn; + ctx->cb_arg = cb_arg; + + spdk_nvmf_subsystem_update_ns(subsystem, subsystem_update_ns_done, ctx); + + return 0; +} + +static void +_spdk_nvmf_ns_hot_remove_done(struct spdk_nvmf_subsystem *subsystem, void *cb_arg, int status) +{ + if (status != 0) { + SPDK_ERRLOG("Failed to make changes to NVMe-oF subsystem with id %u\n", subsystem->id); + } + spdk_nvmf_subsystem_resume(subsystem, NULL, NULL); +} + +static void +_spdk_nvmf_ns_hot_remove(struct spdk_nvmf_subsystem *subsystem, + void *cb_arg, int status) +{ + struct spdk_nvmf_ns *ns = cb_arg; + + spdk_nvmf_subsystem_remove_ns(subsystem, ns->opts.nsid, _spdk_nvmf_ns_hot_remove_done, + subsystem); +} + +static void +spdk_nvmf_ns_hot_remove(void *remove_ctx) +{ + struct spdk_nvmf_ns *ns = remove_ctx; + int rc; + + rc = spdk_nvmf_subsystem_pause(ns->subsystem, _spdk_nvmf_ns_hot_remove, ns); + if (rc) { + SPDK_ERRLOG("Unable to pause subsystem to process namespace removal!\n"); + } +} + +void +spdk_nvmf_ns_opts_get_defaults(struct spdk_nvmf_ns_opts *opts, size_t opts_size) +{ + /* All current fields are set to 0 by default. */ + memset(opts, 0, opts_size); +} + +/* Dummy bdev module used to to claim bdevs. */ +static struct spdk_bdev_module ns_bdev_module = { + .name = "NVMe-oF Target", +}; + +uint32_t +spdk_nvmf_subsystem_add_ns(struct spdk_nvmf_subsystem *subsystem, struct spdk_bdev *bdev, + const struct spdk_nvmf_ns_opts *user_opts, size_t opts_size) +{ + struct spdk_nvmf_ns_opts opts; + struct spdk_nvmf_ns *ns; + int rc; + + if (!(subsystem->state == SPDK_NVMF_SUBSYSTEM_INACTIVE || + subsystem->state == SPDK_NVMF_SUBSYSTEM_PAUSED)) { + return 0; + } + + spdk_nvmf_ns_opts_get_defaults(&opts, sizeof(opts)); + if (user_opts) { + memcpy(&opts, user_opts, spdk_min(sizeof(opts), opts_size)); + } + + if (spdk_mem_all_zero(&opts.uuid, sizeof(opts.uuid))) { + opts.uuid = *spdk_bdev_get_uuid(bdev); + } + + if (opts.nsid == SPDK_NVME_GLOBAL_NS_TAG) { + SPDK_ERRLOG("Invalid NSID %" PRIu32 "\n", opts.nsid); + return 0; + } + + if (opts.nsid == 0) { + /* + * NSID not specified - find a free index. + * + * If no free slots are found, opts.nsid will be subsystem->max_nsid + 1, which will + * expand max_nsid if possible. + */ + for (opts.nsid = 1; opts.nsid <= subsystem->max_nsid; opts.nsid++) { + if (_spdk_nvmf_subsystem_get_ns(subsystem, opts.nsid) == NULL) { + break; + } + } + } + + if (_spdk_nvmf_subsystem_get_ns(subsystem, opts.nsid)) { + SPDK_ERRLOG("Requested NSID %" PRIu32 " already in use\n", opts.nsid); + return 0; + } + + if (opts.nsid > subsystem->max_nsid) { + struct spdk_nvmf_ns **new_ns_array; + + /* If MaxNamespaces was specified, we can't extend max_nsid beyond it. */ + if (subsystem->max_allowed_nsid > 0 && opts.nsid > subsystem->max_allowed_nsid) { + SPDK_ERRLOG("Can't extend NSID range above MaxNamespaces\n"); + return 0; + } + + /* If a controller is connected, we can't change NN. */ + if (!TAILQ_EMPTY(&subsystem->ctrlrs)) { + SPDK_ERRLOG("Can't extend NSID range while controllers are connected\n"); + return 0; + } + + new_ns_array = realloc(subsystem->ns, sizeof(struct spdk_nvmf_ns *) * opts.nsid); + if (new_ns_array == NULL) { + SPDK_ERRLOG("Memory allocation error while resizing namespace array.\n"); + return 0; + } + + memset(new_ns_array + subsystem->max_nsid, 0, + sizeof(struct spdk_nvmf_ns *) * (opts.nsid - subsystem->max_nsid)); + subsystem->ns = new_ns_array; + subsystem->max_nsid = opts.nsid; + } + + ns = calloc(1, sizeof(*ns)); + if (ns == NULL) { + SPDK_ERRLOG("Namespace allocation failed\n"); + return 0; + } + + ns->bdev = bdev; + ns->opts = opts; + ns->subsystem = subsystem; + rc = spdk_bdev_open(bdev, true, spdk_nvmf_ns_hot_remove, ns, &ns->desc); + if (rc != 0) { + SPDK_ERRLOG("Subsystem %s: bdev %s cannot be opened, error=%d\n", + subsystem->subnqn, spdk_bdev_get_name(bdev), rc); + free(ns); + return 0; + } + rc = spdk_bdev_module_claim_bdev(bdev, ns->desc, &ns_bdev_module); + if (rc != 0) { + spdk_bdev_close(ns->desc); + free(ns); + return 0; + } + subsystem->ns[opts.nsid - 1] = ns; + + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Subsystem %s: bdev %s assigned nsid %" PRIu32 "\n", + spdk_nvmf_subsystem_get_nqn(subsystem), + spdk_bdev_get_name(bdev), + opts.nsid); + + spdk_nvmf_subsystem_ns_changed(subsystem, opts.nsid); + + return opts.nsid; +} + +static uint32_t +spdk_nvmf_subsystem_get_next_allocated_nsid(struct spdk_nvmf_subsystem *subsystem, + uint32_t prev_nsid) +{ + uint32_t nsid; + + if (prev_nsid >= subsystem->max_nsid) { + return 0; + } + + for (nsid = prev_nsid + 1; nsid <= subsystem->max_nsid; nsid++) { + if (subsystem->ns[nsid - 1]) { + return nsid; + } + } + + return 0; +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_first_ns(struct spdk_nvmf_subsystem *subsystem) +{ + uint32_t first_nsid; + + first_nsid = spdk_nvmf_subsystem_get_next_allocated_nsid(subsystem, 0); + return _spdk_nvmf_subsystem_get_ns(subsystem, first_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_next_ns(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ns *prev_ns) +{ + uint32_t next_nsid; + + next_nsid = spdk_nvmf_subsystem_get_next_allocated_nsid(subsystem, prev_ns->opts.nsid); + return _spdk_nvmf_subsystem_get_ns(subsystem, next_nsid); +} + +struct spdk_nvmf_ns * +spdk_nvmf_subsystem_get_ns(struct spdk_nvmf_subsystem *subsystem, uint32_t nsid) +{ + return _spdk_nvmf_subsystem_get_ns(subsystem, nsid); +} + +uint32_t +spdk_nvmf_ns_get_id(const struct spdk_nvmf_ns *ns) +{ + return ns->opts.nsid; +} + +struct spdk_bdev * +spdk_nvmf_ns_get_bdev(struct spdk_nvmf_ns *ns) +{ + return ns->bdev; +} + +void +spdk_nvmf_ns_get_opts(const struct spdk_nvmf_ns *ns, struct spdk_nvmf_ns_opts *opts, + size_t opts_size) +{ + memset(opts, 0, opts_size); + memcpy(opts, &ns->opts, spdk_min(sizeof(ns->opts), opts_size)); +} + +const char * +spdk_nvmf_subsystem_get_sn(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->sn; +} + +int +spdk_nvmf_subsystem_set_sn(struct spdk_nvmf_subsystem *subsystem, const char *sn) +{ + size_t len, max_len; + + max_len = sizeof(subsystem->sn) - 1; + len = strlen(sn); + if (len > max_len) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Invalid sn \"%s\": length %zu > max %zu\n", + sn, len, max_len); + return -1; + } + + if (!spdk_nvmf_valid_ascii_string(sn, len)) { + SPDK_DEBUGLOG(SPDK_LOG_NVMF, "Non-ASCII sn\n"); + SPDK_TRACEDUMP(SPDK_LOG_NVMF, "sn", sn, len); + return -1; + } + + snprintf(subsystem->sn, sizeof(subsystem->sn), "%s", sn); + + return 0; +} + +const char * +spdk_nvmf_subsystem_get_nqn(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subnqn; +} + +/* Workaround for astyle formatting bug */ +typedef enum spdk_nvmf_subtype nvmf_subtype_t; + +nvmf_subtype_t +spdk_nvmf_subsystem_get_type(struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->subtype; +} + +static uint16_t +spdk_nvmf_subsystem_gen_cntlid(struct spdk_nvmf_subsystem *subsystem) +{ + int count; + + /* + * In the worst case, we might have to try all CNTLID values between 1 and 0xFFF0 - 1 + * before we find one that is unused (or find that all values are in use). + */ + for (count = 0; count < 0xFFF0 - 1; count++) { + subsystem->next_cntlid++; + if (subsystem->next_cntlid >= 0xFFF0) { + /* The spec reserves cntlid values in the range FFF0h to FFFFh. */ + subsystem->next_cntlid = 1; + } + + /* Check if a controller with this cntlid currently exists. */ + if (spdk_nvmf_subsystem_get_ctrlr(subsystem, subsystem->next_cntlid) == NULL) { + /* Found unused cntlid */ + return subsystem->next_cntlid; + } + } + + /* All valid cntlid values are in use. */ + return 0xFFFF; +} + +int +spdk_nvmf_subsystem_add_ctrlr(struct spdk_nvmf_subsystem *subsystem, struct spdk_nvmf_ctrlr *ctrlr) +{ + ctrlr->cntlid = spdk_nvmf_subsystem_gen_cntlid(subsystem); + if (ctrlr->cntlid == 0xFFFF) { + /* Unable to get a cntlid */ + SPDK_ERRLOG("Reached max simultaneous ctrlrs\n"); + return -EBUSY; + } + + TAILQ_INSERT_TAIL(&subsystem->ctrlrs, ctrlr, link); + + return 0; +} + +void +spdk_nvmf_subsystem_remove_ctrlr(struct spdk_nvmf_subsystem *subsystem, + struct spdk_nvmf_ctrlr *ctrlr) +{ + assert(subsystem == ctrlr->subsys); + TAILQ_REMOVE(&subsystem->ctrlrs, ctrlr, link); +} + +struct spdk_nvmf_ctrlr * +spdk_nvmf_subsystem_get_ctrlr(struct spdk_nvmf_subsystem *subsystem, uint16_t cntlid) +{ + struct spdk_nvmf_ctrlr *ctrlr; + + TAILQ_FOREACH(ctrlr, &subsystem->ctrlrs, link) { + if (ctrlr->cntlid == cntlid) { + return ctrlr; + } + } + + return NULL; +} + +uint32_t +spdk_nvmf_subsystem_get_max_namespaces(const struct spdk_nvmf_subsystem *subsystem) +{ + return subsystem->max_allowed_nsid; +} diff --git a/src/spdk/lib/nvmf/transport.c b/src/spdk/lib/nvmf/transport.c new file mode 100644 index 00000000..af4660c9 --- /dev/null +++ b/src/spdk/lib/nvmf/transport.c @@ -0,0 +1,236 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "spdk/stdinc.h" + +#include "nvmf_internal.h" +#include "transport.h" + +#include "spdk/config.h" +#include "spdk/log.h" +#include "spdk/nvmf.h" +#include "spdk/queue.h" +#include "spdk/util.h" + +static const struct spdk_nvmf_transport_ops *const g_transport_ops[] = { +#ifdef SPDK_CONFIG_RDMA + &spdk_nvmf_transport_rdma, +#endif +}; + +#define NUM_TRANSPORTS (SPDK_COUNTOF(g_transport_ops)) + +static inline const struct spdk_nvmf_transport_ops * +spdk_nvmf_get_transport_ops(enum spdk_nvme_transport_type type) +{ + size_t i; + for (i = 0; i != NUM_TRANSPORTS; i++) { + if (g_transport_ops[i]->type == type) { + return g_transport_ops[i]; + } + } + return NULL; +} + +struct spdk_nvmf_transport * +spdk_nvmf_transport_create(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops = NULL; + struct spdk_nvmf_transport *transport; + + if ((opts->max_io_size % opts->io_unit_size != 0) || + (opts->max_io_size / opts->io_unit_size > + SPDK_NVMF_MAX_SGL_ENTRIES)) { + SPDK_ERRLOG("%s: invalid IO size, MaxIO:%d, UnitIO:%d, MaxSGL:%d\n", + spdk_nvme_transport_id_trtype_str(type), + opts->max_io_size, + opts->io_unit_size, + SPDK_NVMF_MAX_SGL_ENTRIES); + return NULL; + } + + ops = spdk_nvmf_get_transport_ops(type); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", + spdk_nvme_transport_id_trtype_str(type)); + return NULL; + } + + transport = ops->create(opts); + if (!transport) { + SPDK_ERRLOG("Unable to create new transport of type %s\n", + spdk_nvme_transport_id_trtype_str(type)); + return NULL; + } + + transport->ops = ops; + transport->opts = *opts; + + return transport; +} + +int +spdk_nvmf_transport_destroy(struct spdk_nvmf_transport *transport) +{ + return transport->ops->destroy(transport); +} + +int +spdk_nvmf_transport_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return transport->ops->listen(transport, trid); +} + +int +spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid) +{ + return transport->ops->stop_listen(transport, trid); +} + +void +spdk_nvmf_transport_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn) +{ + transport->ops->accept(transport, cb_fn); +} + +void +spdk_nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry) +{ + transport->ops->listener_discover(transport, trid, entry); +} + +struct spdk_nvmf_transport_poll_group * +spdk_nvmf_transport_poll_group_create(struct spdk_nvmf_transport *transport) +{ + struct spdk_nvmf_transport_poll_group *group; + + group = transport->ops->poll_group_create(transport); + group->transport = transport; + + return group; +} + +void +spdk_nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group) +{ + group->transport->ops->poll_group_destroy(group); +} + +int +spdk_nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair) +{ + if (qpair->transport) { + assert(qpair->transport == group->transport); + if (qpair->transport != group->transport) { + return -1; + } + } else { + qpair->transport = group->transport; + } + + return group->transport->ops->poll_group_add(group, qpair); +} + +int +spdk_nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group) +{ + return group->transport->ops->poll_group_poll(group); +} + +int +spdk_nvmf_transport_req_free(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_free(req); +} + +int +spdk_nvmf_transport_req_complete(struct spdk_nvmf_request *req) +{ + return req->qpair->transport->ops->req_complete(req); +} + +void +spdk_nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair) +{ + qpair->transport->ops->qpair_fini(qpair); +} + +bool +spdk_nvmf_transport_qpair_is_idle(struct spdk_nvmf_qpair *qpair) +{ + return qpair->transport->ops->qpair_is_idle(qpair); +} + +int +spdk_nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_peer_trid(qpair, trid); +} + +int +spdk_nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_local_trid(qpair, trid); +} + +int +spdk_nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid) +{ + return qpair->transport->ops->qpair_get_listen_trid(qpair, trid); +} + +bool +spdk_nvmf_transport_opts_init(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts) +{ + const struct spdk_nvmf_transport_ops *ops; + + ops = spdk_nvmf_get_transport_ops(type); + if (!ops) { + SPDK_ERRLOG("Transport type %s unavailable.\n", + spdk_nvme_transport_id_trtype_str(type)); + return false; + } + + ops->opts_init(opts); + return true; +} diff --git a/src/spdk/lib/nvmf/transport.h b/src/spdk/lib/nvmf/transport.h new file mode 100644 index 00000000..1329a80c --- /dev/null +++ b/src/spdk/lib/nvmf/transport.h @@ -0,0 +1,200 @@ +/*- + * BSD LICENSE + * + * Copyright (c) Intel Corporation. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SPDK_NVMF_TRANSPORT_H +#define SPDK_NVMF_TRANSPORT_H + +#include "spdk/stdinc.h" + +#include "spdk/nvme.h" +#include "spdk/nvmf.h" + +struct spdk_nvmf_transport { + struct spdk_nvmf_tgt *tgt; + const struct spdk_nvmf_transport_ops *ops; + struct spdk_nvmf_transport_opts opts; + + TAILQ_ENTRY(spdk_nvmf_transport) link; +}; + +struct spdk_nvmf_transport_ops { + /** + * Transport type + */ + enum spdk_nvme_transport_type type; + + /** + * Initialize transport options to default value + */ + void (*opts_init)(struct spdk_nvmf_transport_opts *opts); + + /** + * Create a transport for the given transport opts + */ + struct spdk_nvmf_transport *(*create)(struct spdk_nvmf_transport_opts *opts); + + /** + * Destroy the transport + */ + int (*destroy)(struct spdk_nvmf_transport *transport); + + /** + * Instruct the transport to accept new connections at the address + * provided. This may be called multiple times. + */ + int (*listen)(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + + /** + * Stop accepting new connections at the given address. + */ + int (*stop_listen)(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + + /** + * Check for new connections on the transport. + */ + void (*accept)(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn); + + /** + * Fill out a discovery log entry for a specific listen address. + */ + void (*listener_discover)(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + + /** + * Create a new poll group + */ + struct spdk_nvmf_transport_poll_group *(*poll_group_create)(struct spdk_nvmf_transport *transport); + + /** + * Destroy a poll group + */ + void (*poll_group_destroy)(struct spdk_nvmf_transport_poll_group *group); + + /** + * Add a qpair to a poll group + */ + int (*poll_group_add)(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + + /** + * Poll the group to process I/O + */ + int (*poll_group_poll)(struct spdk_nvmf_transport_poll_group *group); + + /* + * Free the request without sending a response + * to the originator. Release memory tied to this request. + */ + int (*req_free)(struct spdk_nvmf_request *req); + + /* + * Signal request completion, which sends a response + * to the originator. + */ + int (*req_complete)(struct spdk_nvmf_request *req); + + /* + * Deinitialize a connection. + */ + void (*qpair_fini)(struct spdk_nvmf_qpair *qpair); + + /* + * True if the qpair has no pending IO. + */ + bool (*qpair_is_idle)(struct spdk_nvmf_qpair *qpair); + + /* + * Get the peer transport ID for the queue pair. + */ + int (*qpair_get_peer_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + + /* + * Get the local transport ID for the queue pair. + */ + int (*qpair_get_local_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + + /* + * Get the listener transport ID that accepted this qpair originally. + */ + int (*qpair_get_listen_trid)(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); +}; + + +int spdk_nvmf_transport_stop_listen(struct spdk_nvmf_transport *transport, + const struct spdk_nvme_transport_id *trid); + +void spdk_nvmf_transport_accept(struct spdk_nvmf_transport *transport, new_qpair_fn cb_fn); + +void spdk_nvmf_transport_listener_discover(struct spdk_nvmf_transport *transport, + struct spdk_nvme_transport_id *trid, + struct spdk_nvmf_discovery_log_page_entry *entry); + +struct spdk_nvmf_transport_poll_group *spdk_nvmf_transport_poll_group_create( + struct spdk_nvmf_transport *transport); + +void spdk_nvmf_transport_poll_group_destroy(struct spdk_nvmf_transport_poll_group *group); + +int spdk_nvmf_transport_poll_group_add(struct spdk_nvmf_transport_poll_group *group, + struct spdk_nvmf_qpair *qpair); + +int spdk_nvmf_transport_poll_group_poll(struct spdk_nvmf_transport_poll_group *group); + +int spdk_nvmf_transport_req_free(struct spdk_nvmf_request *req); + +int spdk_nvmf_transport_req_complete(struct spdk_nvmf_request *req); + +void spdk_nvmf_transport_qpair_fini(struct spdk_nvmf_qpair *qpair); + +bool spdk_nvmf_transport_qpair_is_idle(struct spdk_nvmf_qpair *qpair); + +int spdk_nvmf_transport_qpair_get_peer_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int spdk_nvmf_transport_qpair_get_local_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +int spdk_nvmf_transport_qpair_get_listen_trid(struct spdk_nvmf_qpair *qpair, + struct spdk_nvme_transport_id *trid); + +bool spdk_nvmf_transport_opts_init(enum spdk_nvme_transport_type type, + struct spdk_nvmf_transport_opts *opts); + +extern const struct spdk_nvmf_transport_ops spdk_nvmf_transport_rdma; + +#endif /* SPDK_NVMF_TRANSPORT_H */ |